Skip to content

The order of assembly code output from attached IL is broken when optimisation is O1 or greater #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
carlos4242 opened this issue Oct 22, 2017 · 7 comments

Comments

@carlos4242
Copy link

carlos4242 commented Oct 22, 2017

Working from this minimal example, I've worked out Swift code that ends up broken when compiled into AVR assembly, if and only if O1 or greater.

var brightness: Int16 = 0
var fadeAmount: Int16 = 5

while(true) {
  brightness = brightness &+ fadeAmount
  if (brightness <= 0 || brightness >= 255) {
    if fadeAmount > 0 {
      fadeAmount = -5
    }
  }
}

This produces the following LLVM IR when run through the swiftc in Xcode 8.3.3 (swiftc version Apple Swift version 3.1 (swiftlang-802.0.53 clang-802.0.42)).

(Note I've amended this LLVM IR with comments and with test labels so it makes a valid test in llvm-lit if placed in test/CodeGen/AVR.)

; RUN: llc -O=1 -mattr=lpm < %s -march=avr | FileCheck %s

; ModuleID = 'il-ordering-bug1-iffy8.ll'
source_filename = "il-ordering-bug1-iffy8.ll"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9"

%Vs5Int16 = type <{ i16 }>

@_Tv4main10brightnessVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@_Tv4main10fadeAmountVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@__swift_reflection_version = linkonce_odr hidden constant i16 1
@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata", align 8

; CHECK-LABEL: main:
define i32 @main(i32, i8**) #0 {
entry:
  %2 = bitcast i8** %1 to i8*
  ; CHECK:      ldi r24, 0
  ; CHECK-NEXT: ldi r25, 0
  ; CHECK-NEXT: sts _Tv4main10brightnessVs5Int16+1, r25
  ; CHECK-NEXT: sts _Tv4main10brightnessVs5Int16, r24
  store i16 0, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  store i16 5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2

  ; the first label should occur after the global variables are initialised
  ; CHECK-LABEL: LBB0_1:
  br label %3

; <label>:3:                                      ; preds = %26, %entry
  br label %4

; <label>:4:                                      ; preds = %3
  %5 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %6 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
; the llvm.sadd.with.overflow function takes two int16 values
; and returns a struct with two values, the added value and a bool indicating overflow
  %7 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %5, i16 %6)
  %8 = extractvalue { i16, i1 } %7, 0 ; final added value
  %9 = extractvalue { i16, i1 } %7, 1 ; overflow flag
; then stores it back in brightness
  store i16 %8, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2

; if (brightness<=0)
  %10 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %11 = icmp sle i16 %10, 0
  br i1 %11, label %12, label %15

; <label>:12:                                     ; preds = %4
  br label %13

; <label>:13:                                     ; preds = %12, %19
; load %14 with true if we got here from <label>:12:, otherwise load it with %18 if we got here from ; <label>:19:
  %14 = phi i1 [ %18, %19 ], [ true, %12 ]
  br label %20

; <label>:15:                                     ; preds = %4
  br label %16

; <label>:16:                                     ; preds = %15
; %18 = if (brightness>=255)
  %17 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %18 = icmp sge i16 %17, 255
  br label %19

; <label>:19:                                     ; preds = %16
  br label %13

; <label>:20:                                     ; preds = %13
; if (brightness<=0||brightness>=255)
  br i1 %14, label %21, label %26

; <label>:21:                                     ; preds = %20
  %22 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %23 = icmp sgt i16 %22, 0
  br i1 %23, label %24, label %25

; <label>:24:                                     ; preds = %21
  store i16 -5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %25

; <label>:25:                                     ; preds = %24, %21
  br label %26

; CHECK: rjmp LBB0_1{{$}}
; CHECK-NEXT: .Lfunc_end0
; if (brightness>0&&brightness<255)
; <label>:26:                                     ; preds = %25, %20
  br label %3
}

; Function Attrs: nounwind readnone
declare { i16, i1 } @llvm.sadd.with.overflow.i16(i16, i16) #1

attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }
attributes #1 = { nounwind readnone }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !9, !10}

!0 = !{i32 1, !"Objective-C Version", i32 2}
!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
!3 = !{i32 4, !"Objective-C Garbage Collection", i32 1024}
!4 = !{i32 1, !"Objective-C Class Properties", i32 64}
!5 = !{i32 6, !"Linker Options", !6}
!6 = !{!7, !8}
!7 = !{!"-lswiftCore"}
!8 = !{!"-lobjc"}
!9 = !{i32 1, !"PIC Level", i32 2}
!10 = !{i32 1, !"Swift Version", i32 4}

When compiled with -O=0, this produces very cumbersome but valid assembly:

	.text
	.macosx_version_min 10, 9
	.file	"<stdin>"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
	push	r28
	push	r29
	in	r28, 61
	in	r29, 62
	sbiw	r28, 7
	in	r0, 63
	cli
	out	62, r29
	out	63, r0
	out	61, r28
	ldi	r24, 0
	ldi	r25, 0
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	ldi	r24, 5
	ldi	r25, 0
	sts	_Tv4main10fadeAmountVs5Int16+1, r25
	sts	_Tv4main10fadeAmountVs5Int16, r24
	std	Y+6, r20                ; 2-byte Folded Spill
	std	Y+7, r21                ; 2-byte Folded Spill
	rjmp	LBB0_1
LBB0_1:                                 ; =>This Inner Loop Header: Depth=1
	rjmp	LBB0_2
LBB0_2:                                 ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	lds	r18, _Tv4main10fadeAmountVs5Int16
	lds	r19, _Tv4main10fadeAmountVs5Int16+1
	add	r24, r18
	adc	r25, r19
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	ldi	r18, 0
	ldi	r19, 0
	cp	r18, r24
	cpc	r19, r25
	brlt	LBB0_5
	rjmp	LBB0_3
LBB0_3:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldi	r24, 1
	std	Y+5, r24                ; 1-byte Folded Spill
	rjmp	LBB0_4
LBB0_4:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+5                ; 1-byte Folded Reload
	std	Y+4, r24                ; 1-byte Folded Spill
	rjmp	LBB0_10
LBB0_5:                                 ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_6
LBB0_6:                                 ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	ldi	r18, 255
	ldi	r19, 0
	ldi	r20, 0
	ldi	r21, 1
	cp	r24, r18
	cpc	r25, r19
	std	Y+3, r20                ; 1-byte Folded Spill
	std	Y+2, r21                ; 1-byte Folded Spill
	brge	LBB0_7
	rjmp	LBB0_8
LBB0_7:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+2                ; 1-byte Folded Reload
	std	Y+1, r24                ; 1-byte Folded Spill
	rjmp	LBB0_9
LBB0_8:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+3                ; 1-byte Folded Reload
	std	Y+2, r24                ; 1-byte Folded Spill
	rjmp	LBB0_7
LBB0_9:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+1                ; 1-byte Folded Reload
	std	Y+5, r24                ; 1-byte Folded Spill
	rjmp	LBB0_4
LBB0_10:                                ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+4                ; 1-byte Folded Reload
	andi	r24, 1
	cpi	r24, 0
	breq	LBB0_14
	rjmp	LBB0_11
LBB0_11:                                ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10fadeAmountVs5Int16
	lds	r25, _Tv4main10fadeAmountVs5Int16+1
	ldi	r18, 0
	ldi	r19, 0
	cp	r18, r24
	cpc	r19, r25
	brge	LBB0_13
	rjmp	LBB0_12
LBB0_12:                                ;   in Loop: Header=BB0_1 Depth=1
	ldi	r24, 251
	ldi	r25, 255
	sts	_Tv4main10fadeAmountVs5Int16+1, r25
	sts	_Tv4main10fadeAmountVs5Int16, r24
	rjmp	LBB0_13
LBB0_13:                                ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_14
LBB0_14:                                ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_1
.Lfunc_end0:
	.size	main, .Lfunc_end0-main

	.hidden	_Tv4main10brightnessVs5Int16 ; @_Tv4main10brightnessVs5Int16
	.type	_Tv4main10brightnessVs5Int16,@object
	.section	.bss,"aw",@nobits
	.globl	_Tv4main10brightnessVs5Int16
	.p2align	1
_Tv4main10brightnessVs5Int16:
	.zero	2
	.size	_Tv4main10brightnessVs5Int16, 2

	.hidden	_Tv4main10fadeAmountVs5Int16 ; @_Tv4main10fadeAmountVs5Int16
	.type	_Tv4main10fadeAmountVs5Int16,@object
	.globl	_Tv4main10fadeAmountVs5Int16
	.p2align	1
_Tv4main10fadeAmountVs5Int16:
	.zero	2
	.size	_Tv4main10fadeAmountVs5Int16, 2

	.hidden	__swift_reflection_version ; @__swift_reflection_version
	.type	__swift_reflection_version,@object
	.section	.rodata,"a",@progbits
	.weak	__swift_reflection_version
__swift_reflection_version:
	.short	1                       ; 0x1
	.size	__swift_reflection_version, 2


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

When compiled with -O=1 we get this assembly, which is much more efficient but doesn't work. In particular the while (true) {} loop is effectively broken.

The rjmp jump back to LBB0_1 should (probably?) be the last instruction. Instead there's another couple of labels after that. Looks like some reordering went wrong during optimisation?

	.text
	.macosx_version_min 10, 9
	.file	"<stdin>"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
	ldi	r24, 0
	ldi	r25, 0
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	ldi	r18, 5
	ldi	r19, 0
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
	ldi	r18, 251
	ldi	r19, 255
	ldi	r20, 255
	ldi	r21, 0
	rjmp	LBB0_2
LBB0_1:                                 ;   in Loop: Header=BB0_2 Depth=1
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
LBB0_2:                                 ; =>This Inner Loop Header: Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	add	r30, r22
	adc	r31, r23
	sts	_Tv4main10brightnessVs5Int16+1, r31
	sts	_Tv4main10brightnessVs5Int16, r30
	ldi	r22, 1
	cp	r24, r30
	cpc	r25, r31
	brlt	LBB0_5
; BB#3:                                 ;   in Loop: Header=BB0_2 Depth=1
	andi	r22, 1
	cpi	r22, 0
	breq	LBB0_2
; BB#4:                                 ;   in Loop: Header=BB0_2 Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	cp	r24, r22
	cpc	r25, r23
	brge	LBB0_2
	rjmp	LBB0_1
LBB0_5:
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	cp	r30, r20
	cpc	r31, r21
	brge	LBB0_7
; BB#6:
	ldi	r22, 0
LBB0_7:
.Lfunc_end0:
	.size	main, .Lfunc_end0-main

	.hidden	_Tv4main10brightnessVs5Int16 ; @_Tv4main10brightnessVs5Int16
	.type	_Tv4main10brightnessVs5Int16,@object
	.section	.bss,"aw",@nobits
	.globl	_Tv4main10brightnessVs5Int16
	.p2align	1
_Tv4main10brightnessVs5Int16:
	.zero	2
	.size	_Tv4main10brightnessVs5Int16, 2

	.hidden	_Tv4main10fadeAmountVs5Int16 ; @_Tv4main10fadeAmountVs5Int16
	.type	_Tv4main10fadeAmountVs5Int16,@object
	.globl	_Tv4main10fadeAmountVs5Int16
	.p2align	1
_Tv4main10fadeAmountVs5Int16:
	.zero	2
	.size	_Tv4main10fadeAmountVs5Int16, 2

	.hidden	__swift_reflection_version ; @__swift_reflection_version
	.type	__swift_reflection_version,@object
	.section	.rodata,"a",@progbits
	.weak	__swift_reflection_version
__swift_reflection_version:
	.short	1                       ; 0x1
	.size	__swift_reflection_version, 2


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

I want to use this script with bugpoint to reduce the test case but I'm new to this so it will take a while to get my head round the tool.

#!/bin/sh

PATH="build/llvm/bin:$PATH"

llc -O=1 -mattr=lpm < "$@" -march=avr | FileCheck "$@"
@carlos4242
Copy link
Author

Update:

I realised the FileCheck directives in the LLVM were both too specific in some places (specifying some instructions at the start that aren't pertinent to this test case) and not general enough.

In essence the test is about whether the optimiser breaks the while loop.

You should see:

  • a main: label
  • some preamble code
  • the first label in the program, which is the start of the while loop
  • some other code (the body of the while loop)
  • an rjmp (to first label)
  • no more instructions before .Lfunc_end0

Using variables, I improved the test:

; RUN: llc -O=1 -mattr=lpm < %s -march=avr | FileCheck %s

; ModuleID = 'il-ordering-bug1-iffy8.ll'
source_filename = "il-ordering-bug1-iffy8.ll"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9"

%Vs5Int16 = type <{ i16 }>

@_Tv4main10brightnessVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@_Tv4main10fadeAmountVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@__swift_reflection_version = linkonce_odr hidden constant i16 1
@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata", align 8

; CHECK-LABEL: main:
define i32 @main(i32, i8**) #0 {
entry:
  %2 = bitcast i8** %1 to i8*
  ; CHECK-NOT: LBB{{[0-9]_[0-9]}}:
  store i16 0, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  store i16 5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2

  ; the first label should occur after the global variables are initialised
  ; CHECK: [[WHILELBL:LBB[0-9]_[0-9]]]:
  br label %3

; <label>:3:                                      ; preds = %26, %entry
  br label %4

; <label>:4:                                      ; preds = %3
  %5 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %6 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
; the llvm.sadd.with.overflow function takes two int16 values
; and returns a struct with two values, the added value and a bool indicating overflow
  %7 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %5, i16 %6)
  %8 = extractvalue { i16, i1 } %7, 0 ; final added value
  %9 = extractvalue { i16, i1 } %7, 1 ; overflow flag
; then stores it back in brightness
  store i16 %8, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2

; if (brightness<=0)
  %10 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %11 = icmp sle i16 %10, 0
  br i1 %11, label %12, label %15

; <label>:12:                                     ; preds = %4
  br label %13

; CHECK-NOT: rjmp [[WHILELBL]]{{$}}

; <label>:13:                                     ; preds = %12, %19
; load %14 with true if we got here from <label>:12:, otherwise load it with %18 if we got here from ; <label>:19:
  %14 = phi i1 [ %18, %19 ], [ true, %12 ]
  br label %20

; <label>:15:                                     ; preds = %4
  br label %16

; <label>:16:                                     ; preds = %15
; %18 = if (brightness>=255)
  %17 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %18 = icmp sge i16 %17, 255
  br label %19

; <label>:19:                                     ; preds = %16
  br label %13

; <label>:20:                                     ; preds = %13
; if (brightness<=0||brightness>=255)
  br i1 %14, label %21, label %26

; <label>:21:                                     ; preds = %20
  %22 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %23 = icmp sgt i16 %22, 0
  br i1 %23, label %24, label %25

; <label>:24:                                     ; preds = %21
  store i16 -5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %25

; <label>:25:                                     ; preds = %24, %21
  br label %26

; CHECK: rjmp [[WHILELBL]]{{$}}
; CHECK-NEXT: .Lfunc_end0
; if (brightness>0&&brightness<255)
; <label>:26:                                     ; preds = %25, %20
  br label %3
}

; Function Attrs: nounwind readnone
declare { i16, i1 } @llvm.sadd.with.overflow.i16(i16, i16) #1

attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }
attributes #1 = { nounwind readnone }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !9, !10}

!0 = !{i32 1, !"Objective-C Version", i32 2}
!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
!3 = !{i32 4, !"Objective-C Garbage Collection", i32 1024}
!4 = !{i32 1, !"Objective-C Class Properties", i32 64}
!5 = !{i32 6, !"Linker Options", !6}
!6 = !{!7, !8}
!7 = !{!"-lswiftCore"}
!8 = !{!"-lobjc"}
!9 = !{i32 1, !"PIC Level", i32 2}
!10 = !{i32 1, !"Swift Version", i32 4}

This breaks at the moment and should start working when the optimiser bug is fixed.

I attempted bugpoint using these scripts...

test-compile.sh

#!/bin/sh

PATH="build/llvm/bin:$PATH"

llc -O=1 -mattr=lpm < "$@" -march=avr | FileCheck "$@" 2> /dev/null
O1RESULT=$?

llc -O=0 -mattr=lpm < "$@" -march=avr | FileCheck "$@" 2> /dev/null
O0RESULT=$?

# return a fail if "interesting", which means filecheck passes with O0 and fails with O1
# or in another way of saying it, return a success meaning "uninteresting" if either O0 fails or O1 succeeds
[ ! O0RESULT -o O1RESULT ]

bugpoint.sh

#!/bin/bash

export PATH="build/llvm/bin:$PATH"

bugpoint $1 -compile-custom -compile-command ./test-compile.sh -safe-tool-args='-march=avr -mcpu=atmega328p -filetype=obj'

Using ./bugpoint.sh test/CodeGen/AVR/il-ordering-bug1-bad.ll

This produces a bunch of files.

Normally I'd expect bugpoint-reduced-simplified.bc to have the simplified test case.

Sadly it looks a bit thin...

; ModuleID = 'bugpoint-reduced-simplified.bc'
source_filename = "bugpoint-output-f5251e3.bc"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9"

define void @main() #0 {
entry:
  br label %0

; <label>:0:                                      ; preds = %0, %entry
  br label %0
}

attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }

!llvm.module.flags = !{!0}

!0 = !{i32 1, !"Swift Version", i32 4}

Or compiled:

	.text
	.macosx_version_min 10, 9
	.file	"bugpoint-reduced-simplified.ll"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
LBB0_1:                                 ; =>This Inner Loop Header: Depth=1
	rjmp	LBB0_1
.Lfunc_end0:
	.size	main, .Lfunc_end0-main


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

Which is not exhibiting the behaviour so bugpoint has been too aggressive and has removed the code that triggered the optimiser bug.

All the other .bc files, when compiled are still not showing the buggy behaviour.

So unless anyone has a better idea, I think we cannot easily get a reduced test case and we'll have to debug with the full sized LLVM IR file here.

@carlos4242
Copy link
Author

OK so more details, I tried using opt-bisect-limit to find the faulting step. Even with it set to 0 (which should be no optimise passes), it produces mis-ordered code. I debugged that with print-after-all to see what was happening. The clue is somewhere in these 7000 lines. Any help gratefully received!...

@carlos4242
Copy link
Author

dbg-more-detail.txt

@carlos4242
Copy link
Author

By accident I forgot to specify arch = avr this time and got an interesting result...

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 9
	.globl	_main
	.p2align	4, 0x90
_main:                                  ## @main
	.cfi_startproc
## BB#0:                                ## %entry
	pushq	%rbp
Lcfi0:
	.cfi_def_cfa_offset 16
Lcfi1:
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
Lcfi2:
	.cfi_def_cfa_register %rbp
	movw	$0, __Tv4main10brightnessVs5Int16(%rip)
	movw	$5, __Tv4main10fadeAmountVs5Int16(%rip)
	jmp	LBB0_1
	.p2align	4, 0x90
LBB0_5:                                 ##   in Loop: Header=BB0_1 Depth=1
	movw	$-5, __Tv4main10fadeAmountVs5Int16(%rip)
LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
	movzwl	__Tv4main10brightnessVs5Int16(%rip), %ecx
	addw	__Tv4main10fadeAmountVs5Int16(%rip), %cx
	movw	%cx, __Tv4main10brightnessVs5Int16(%rip)
	movb	$1, %al
	testw	%cx, %cx
	jle	LBB0_3
## BB#2:                                ##   in Loop: Header=BB0_1 Depth=1
	movswl	__Tv4main10brightnessVs5Int16(%rip), %eax
	cmpl	$254, %eax
	setg	%al
LBB0_3:                                 ##   in Loop: Header=BB0_1 Depth=1
	testb	%al, %al
	je	LBB0_1
## BB#4:                                ##   in Loop: Header=BB0_1 Depth=1
	cmpw	$0, __Tv4main10fadeAmountVs5Int16(%rip)
	jle	LBB0_1
	jmp	LBB0_5
	.cfi_endproc

	.private_extern	__Tv4main10brightnessVs5Int16 ## @_Tv4main10brightnessVs5Int16
	.globl	__Tv4main10brightnessVs5Int16
.zerofill __DATA,__common,__Tv4main10brightnessVs5Int16,2,1
	.private_extern	__Tv4main10fadeAmountVs5Int16 ## @_Tv4main10fadeAmountVs5Int16
	.globl	__Tv4main10fadeAmountVs5Int16
.zerofill __DATA,__common,__Tv4main10fadeAmountVs5Int16,2,1
	.private_extern	___swift_reflection_version ## @__swift_reflection_version
	.section	__TEXT,__const
	.globl	___swift_reflection_version
	.weak_definition	___swift_reflection_version
	.p2align	1
___swift_reflection_version:
	.short	1                       ## 0x1

	.no_dead_strip	___swift_reflection_version
	.linker_option "-lswiftCore"
	.linker_option "-lobjc"
	.section	__DATA,__objc_imageinfo,regular,no_dead_strip
L_OBJC_IMAGE_INFO:
	.long	0
	.long	1088


.subsections_via_symbols

Compare that to avr...

	.text
	.macosx_version_min 10, 9
	.file	"<stdin>"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
	ldi	r24, 0
	ldi	r25, 0
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	ldi	r18, 5
	ldi	r19, 0
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
	ldi	r18, 251
	ldi	r19, 255
	ldi	r20, 255
	ldi	r21, 0
	rjmp	LBB0_2
LBB0_1:                                 ;   in Loop: Header=BB0_2 Depth=1
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
LBB0_2:                                 ; =>This Inner Loop Header: Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	add	r30, r22
	adc	r31, r23
	sts	_Tv4main10brightnessVs5Int16+1, r31
	sts	_Tv4main10brightnessVs5Int16, r30
	ldi	r22, 1
	cp	r24, r30
	cpc	r25, r31
	brlt	LBB0_5
; BB#3:                                 ;   in Loop: Header=BB0_2 Depth=1
	andi	r22, 1
	cpi	r22, 0
	breq	LBB0_2
; BB#4:                                 ;   in Loop: Header=BB0_2 Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	cp	r24, r22
	cpc	r25, r23
	brge	LBB0_2
	rjmp	LBB0_1
LBB0_5:
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	cp	r30, r20
	cpc	r31, r21
	brge	LBB0_7
; BB#6:
	ldi	r22, 0
LBB0_7:
.Lfunc_end0:
	.size	main, .Lfunc_end0-main

	.hidden	_Tv4main10brightnessVs5Int16 ; @_Tv4main10brightnessVs5Int16
	.type	_Tv4main10brightnessVs5Int16,@object
	.section	.bss,"aw",@nobits
	.globl	_Tv4main10brightnessVs5Int16
	.p2align	1
_Tv4main10brightnessVs5Int16:
	.zero	2
	.size	_Tv4main10brightnessVs5Int16, 2

	.hidden	_Tv4main10fadeAmountVs5Int16 ; @_Tv4main10fadeAmountVs5Int16
	.type	_Tv4main10fadeAmountVs5Int16,@object
	.globl	_Tv4main10fadeAmountVs5Int16
	.p2align	1
_Tv4main10fadeAmountVs5Int16:
	.zero	2
	.size	_Tv4main10fadeAmountVs5Int16, 2

	.hidden	__swift_reflection_version ; @__swift_reflection_version
	.type	__swift_reflection_version,@object
	.section	.rodata,"a",@progbits
	.weak	__swift_reflection_version
__swift_reflection_version:
	.short	1                       ; 0x1
	.size	__swift_reflection_version, 2


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

you can see that the order of instructions looks sane on the other architecture (arm?) but is broken on AVR. So it's definitely something about our transformations that's breaking it.

@carlos4242
Copy link
Author

carlos4242 commented Nov 26, 2017

It's definitely a reordering thing that is being introduced just with assembly printed for this backend.

Look at this investigation...


  .globl  _main
  .p2align  4, 0x90
_main:                                  ## @main
  .cfi_startproc
## BB#0:                                ## %entry
  pushq %rbp
Lcfi0:
  .cfi_def_cfa_offset 16
Lcfi1:
  .cfi_offset %rbp, -16
  movq  %rsp, %rbp
Lcfi2:
  .cfi_def_cfa_register %rbp
  movw  $0, __Tv4main10brightnessVs5Int16(%rip)
  movw  $5, __Tv4main10fadeAmountVs5Int16(%rip)
  jmp LBB0_1
  .p2align  4, 0x90
LBB0_5:                                 ##   in Loop: Header=BB0_1 Depth=1
  movw  $-5, __Tv4main10fadeAmountVs5Int16(%rip)
LBB0_1:                                 ## =>This Inner Loop Header: Depth=1
  movzwl  __Tv4main10brightnessVs5Int16(%rip), %ecx
  addw  __Tv4main10fadeAmountVs5Int16(%rip), %cx
  movw  %cx, __Tv4main10brightnessVs5Int16(%rip)
  movb  $1, %al
  testw %cx, %cx
  jle LBB0_3
## BB#2:                                ##   in Loop: Header=BB0_1 Depth=1
  movswl  __Tv4main10brightnessVs5Int16(%rip), %eax
  cmpl  $254, %eax
  setg  %al
LBB0_3:                                 ##   in Loop: Header=BB0_1 Depth=1
  testb %al, %al
  je  LBB0_1
## BB#4:                                ##   in Loop: Header=BB0_1 Depth=1
  cmpw  $0, __Tv4main10fadeAmountVs5Int16(%rip)
  jle LBB0_1
  jmp LBB0_5
  .cfi_endproc



***BAD***

  .globl  main
  .p2align  1
  .type main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
  ldi r24, 0
  ldi r25, 0
  sts _Tv4main10brightnessVs5Int16+1, r25
  sts _Tv4main10brightnessVs5Int16, r24
  ldi r18, 5
  ldi r19, 0
  sts _Tv4main10fadeAmountVs5Int16+1, r19
  sts _Tv4main10fadeAmountVs5Int16, r18
  ldi r18, 251
  ldi r19, 255
  ldi r20, 255
  ldi r21, 0
  rjmp  LBB0_2
LBB0_1:                                 ;   in Loop: Header=BB0_2 Depth=1
  sts _Tv4main10fadeAmountVs5Int16+1, r19
  sts _Tv4main10fadeAmountVs5Int16, r18
LBB0_2:                                 ; =>This Inner Loop Header: Depth=1
  lds r22, _Tv4main10fadeAmountVs5Int16
  lds r23, _Tv4main10fadeAmountVs5Int16+1
  lds r30, _Tv4main10brightnessVs5Int16
  lds r31, _Tv4main10brightnessVs5Int16+1
  add r30, r22
  adc r31, r23
  sts _Tv4main10brightnessVs5Int16+1, r31
  sts _Tv4main10brightnessVs5Int16, r30
  ldi r22, 1 ... should be using a different register for the logic test, this register is stomped
  cp  r24, r30
  cpc r25, r31
  brlt  LBB0_5
; BB#3:                                 ;   in Loop: Header=BB0_2 Depth=1
  andi  r22, 1
  cpi r22, 0
  breq  LBB0_2
; BB#4:                                 ;   in Loop: Header=BB0_2 Depth=1
  lds r22, _Tv4main10fadeAmountVs5Int16
  lds r23, _Tv4main10fadeAmountVs5Int16+1
  cp  r24, r22
  cpc r25, r23
  brge  LBB0_2
  rjmp  LBB0_1
LBB0_5:
  lds r30, _Tv4main10brightnessVs5Int16
  lds r31, _Tv4main10brightnessVs5Int16+1
  cp  r30, r20
  cpc r31, r21
  brge  LBB0_7
; BB#6:
  ldi r22, 0
LBB0_7:
.Lfunc_end0:
  .size main, .Lfunc_end0-main




  ***REPAIRED***

  .globl  main
  .p2align  1
  .type main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
  ldi r24, 0
  ldi r25, 0
  sts _Tv4main10brightnessVs5Int16+1, r25
  sts _Tv4main10brightnessVs5Int16, r24
  ldi r18, 5
  ldi r19, 0
  sts _Tv4main10fadeAmountVs5Int16+1, r19
  sts _Tv4main10fadeAmountVs5Int16, r18
  ldi r18, 251
  ldi r19, 255
  ldi r20, 255
  ldi r21, 0
  rjmp  LBB0_2
LBB0_1:                                 ;   in Loop: Header=BB0_2 Depth=1
  sts _Tv4main10fadeAmountVs5Int16+1, r19
  sts _Tv4main10fadeAmountVs5Int16, r18
LBB0_2:                                 ; =>This Inner Loop Header: Depth=1
  lds r22, _Tv4main10fadeAmountVs5Int16
  lds r23, _Tv4main10fadeAmountVs5Int16+1
  lds r30, _Tv4main10brightnessVs5Int16
  lds r31, _Tv4main10brightnessVs5Int16+1
  add r30, r22
  adc r31, r23
  sts _Tv4main10brightnessVs5Int16+1, r31
  sts _Tv4main10brightnessVs5Int16, r30
  ldi r22, 1 ... should be using a different register for the logic test, this register is stomped
  cp  r24, r30
  cpc r25, r31
  brlt  LBB0_5
; BB#3:                                 ;   in Loop: Header=BB0_2 Depth=1
  lds r30, _Tv4main10brightnessVs5Int16
  lds r31, _Tv4main10brightnessVs5Int16+1
  cp  r30, r20
  cpc r31, r21
  brge  LBB0_5
; BB#4:                                 ;   in Loop: Header=BB0_2 Depth=1
  ldi r22, 0
LBB0_5:
  andi  r22, 1
  cpi r22, 0
  breq  LBB0_2
; BB#6:                                 ;   in Loop: Header=BB0_2 Depth=1
  lds r22, _Tv4main10fadeAmountVs5Int16
  lds r23, _Tv4main10fadeAmountVs5Int16+1
  cp  r24, r22
  cpc r25, r23
  brge  LBB0_2
  rjmp  LBB0_1
.Lfunc_end0:
  .size main, .Lfunc_end0-main

The first ("GOOD") is the output from another architecture for exactly the same input LLVM IR, the second ("BAD") is the output from avr for this LLVM IR. The third ("REPAIRED") is my suggestion for how to fix this in this one example.

It looks like the order of machine basic blocks has gone wrong. If you move the MBBs represented by the code between labels LBB0_5: and LBB0_7: up before the comment ; BB#3: (presumably the start of the fourth machine basic block) then it all works and it then exactly mirrors the structure of the arm(?) code from the "GOOD" compile.

@carlos4242
Copy link
Author

As a further note, I looked at the source LLVM IR...

define i32 @main(i32, i8**) #0 {
entry:
  %2 = bitcast i8** %1 to i8*
  store i16 0, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  store i16 5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %3

; <label>:3:                                      ; preds = %26, %entry
  br label %4

; <label>:4:                                      ; preds = %3
  %5 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %6 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
; the llvm.sadd.with.overflow function takes two int16 values
; and returns a struct with two values, the added value and a bool indicating overflow
  %7 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %5, i16 %6)
  %8 = extractvalue { i16, i1 } %7, 0 ; final added value
  %9 = extractvalue { i16, i1 } %7, 1 ; overflow flag
; then stores it back in brightness
  store i16 %8, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2

; if (brightness<=0)
  %10 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %11 = icmp sle i16 %10, 0
  br i1 %11, label %12, label %15

; <label>:12:                                     ; preds = %4
  br label %13

; CHECK-NOT: rjmp [[WHILELBL]]{{$}}

; <label>:13:                                     ; preds = %12, %19
; load %14 with true if we got here from <label>:12:, otherwise load it with %18 if we got here from ; <label>:19:
  %14 = phi i1 [ %18, %19 ], [ true, %12 ]
  br label %20

; <label>:15:                                     ; preds = %4
  br label %16

; <label>:16:                                     ; preds = %15
; %18 = if (brightness>=255)
  %17 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %18 = icmp sge i16 %17, 255
  br label %19

; <label>:19:                                     ; preds = %16
  br label %13

; <label>:20:                                     ; preds = %13
; if (brightness<=0||brightness>=255)
  br i1 %14, label %21, label %26

; <label>:21:                                     ; preds = %20
  %22 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %23 = icmp sgt i16 %22, 0
  br i1 %23, label %24, label %25

; <label>:24:                                     ; preds = %21
  store i16 -5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %25

; <label>:25:                                     ; preds = %24, %21
  br label %26

; CHECK: rjmp [[WHILELBL]]{{$}}
; CHECK-NEXT: .Lfunc_end0
; if (brightness>0&&brightness<255)
; <label>:26:                                     ; preds = %25, %20
  br label %3
}

... this straighforwardly rearranges to...

define i32 @main(i32, i8**) #0 {
entry:
  %2 = bitcast i8** %1 to i8*
  store i16 0, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  store i16 5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %3

; <label>:24:                                     ; preds = %21
  store i16 -5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2

; <label>:3:                                      ; preds = %20, %21, %entry
  %5 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %6 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %7 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %5, i16 %6)
  %8 = extractvalue { i16, i1 } %7, 0 ; final added value
  %9 = extractvalue { i16, i1 } %7, 1 ; overflow flag
  store i16 %8, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %10 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %11 = icmp sle i16 %10, 0
  br i1 %11, label %13, label %16

; <label>:13:                                     ; preds = %3, %16
  %14 = phi i1 [ %18, %16 ], [ true, %4 ]
  br label %20

; <label>:16:                                     ; preds = %3
  %17 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %18 = icmp sge i16 %17, 255
  br label %13

; <label>:20:                                     ; preds = %13
  br i1 %14, label %21, label %3

; <label>:21:                                     ; preds = %20
  %22 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %23 = icmp sgt i16 %22, 0
  br i1 %23, label %24, label %3
}

...which gives some clues as to what might have gone wrong as it mirrors the wrong assembly.

@carlos4242
Copy link
Author

Apple looked at it. Their simple solution was to run the llvm ir through opt first. It worked!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant