Skip to content

The order of assembly code output from attached IL is broken when optimisation is O1 or greater #83

Closed
@carlos4242

Description

@carlos4242

Working from this minimal example, I've worked out Swift code that ends up broken when compiled into AVR assembly, if and only if O1 or greater.

var brightness: Int16 = 0
var fadeAmount: Int16 = 5

while(true) {
  brightness = brightness &+ fadeAmount
  if (brightness <= 0 || brightness >= 255) {
    if fadeAmount > 0 {
      fadeAmount = -5
    }
  }
}

This produces the following LLVM IR when run through the swiftc in Xcode 8.3.3 (swiftc version Apple Swift version 3.1 (swiftlang-802.0.53 clang-802.0.42)).

(Note I've amended this LLVM IR with comments and with test labels so it makes a valid test in llvm-lit if placed in test/CodeGen/AVR.)

; RUN: llc -O=1 -mattr=lpm < %s -march=avr | FileCheck %s

; ModuleID = 'il-ordering-bug1-iffy8.ll'
source_filename = "il-ordering-bug1-iffy8.ll"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9"

%Vs5Int16 = type <{ i16 }>

@_Tv4main10brightnessVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@_Tv4main10fadeAmountVs5Int16 = hidden global %Vs5Int16 zeroinitializer, align 2
@__swift_reflection_version = linkonce_odr hidden constant i16 1
@llvm.used = appending global [1 x i8*] [i8* bitcast (i16* @__swift_reflection_version to i8*)], section "llvm.metadata", align 8

; CHECK-LABEL: main:
define i32 @main(i32, i8**) #0 {
entry:
  %2 = bitcast i8** %1 to i8*
  ; CHECK:      ldi r24, 0
  ; CHECK-NEXT: ldi r25, 0
  ; CHECK-NEXT: sts _Tv4main10brightnessVs5Int16+1, r25
  ; CHECK-NEXT: sts _Tv4main10brightnessVs5Int16, r24
  store i16 0, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  store i16 5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2

  ; the first label should occur after the global variables are initialised
  ; CHECK-LABEL: LBB0_1:
  br label %3

; <label>:3:                                      ; preds = %26, %entry
  br label %4

; <label>:4:                                      ; preds = %3
  %5 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %6 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
; the llvm.sadd.with.overflow function takes two int16 values
; and returns a struct with two values, the added value and a bool indicating overflow
  %7 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 %5, i16 %6)
  %8 = extractvalue { i16, i1 } %7, 0 ; final added value
  %9 = extractvalue { i16, i1 } %7, 1 ; overflow flag
; then stores it back in brightness
  store i16 %8, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2

; if (brightness<=0)
  %10 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %11 = icmp sle i16 %10, 0
  br i1 %11, label %12, label %15

; <label>:12:                                     ; preds = %4
  br label %13

; <label>:13:                                     ; preds = %12, %19
; load %14 with true if we got here from <label>:12:, otherwise load it with %18 if we got here from ; <label>:19:
  %14 = phi i1 [ %18, %19 ], [ true, %12 ]
  br label %20

; <label>:15:                                     ; preds = %4
  br label %16

; <label>:16:                                     ; preds = %15
; %18 = if (brightness>=255)
  %17 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10brightnessVs5Int16, i32 0, i32 0), align 2
  %18 = icmp sge i16 %17, 255
  br label %19

; <label>:19:                                     ; preds = %16
  br label %13

; <label>:20:                                     ; preds = %13
; if (brightness<=0||brightness>=255)
  br i1 %14, label %21, label %26

; <label>:21:                                     ; preds = %20
  %22 = load i16, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  %23 = icmp sgt i16 %22, 0
  br i1 %23, label %24, label %25

; <label>:24:                                     ; preds = %21
  store i16 -5, i16* getelementptr inbounds (%Vs5Int16, %Vs5Int16* @_Tv4main10fadeAmountVs5Int16, i32 0, i32 0), align 2
  br label %25

; <label>:25:                                     ; preds = %24, %21
  br label %26

; CHECK: rjmp LBB0_1{{$}}
; CHECK-NEXT: .Lfunc_end0
; if (brightness>0&&brightness<255)
; <label>:26:                                     ; preds = %25, %20
  br label %3
}

; Function Attrs: nounwind readnone
declare { i16, i1 } @llvm.sadd.with.overflow.i16(i16, i16) #1

attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }
attributes #1 = { nounwind readnone }

!llvm.module.flags = !{!0, !1, !2, !3, !4, !5, !9, !10}

!0 = !{i32 1, !"Objective-C Version", i32 2}
!1 = !{i32 1, !"Objective-C Image Info Version", i32 0}
!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
!3 = !{i32 4, !"Objective-C Garbage Collection", i32 1024}
!4 = !{i32 1, !"Objective-C Class Properties", i32 64}
!5 = !{i32 6, !"Linker Options", !6}
!6 = !{!7, !8}
!7 = !{!"-lswiftCore"}
!8 = !{!"-lobjc"}
!9 = !{i32 1, !"PIC Level", i32 2}
!10 = !{i32 1, !"Swift Version", i32 4}

When compiled with -O=0, this produces very cumbersome but valid assembly:

	.text
	.macosx_version_min 10, 9
	.file	"<stdin>"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
	push	r28
	push	r29
	in	r28, 61
	in	r29, 62
	sbiw	r28, 7
	in	r0, 63
	cli
	out	62, r29
	out	63, r0
	out	61, r28
	ldi	r24, 0
	ldi	r25, 0
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	ldi	r24, 5
	ldi	r25, 0
	sts	_Tv4main10fadeAmountVs5Int16+1, r25
	sts	_Tv4main10fadeAmountVs5Int16, r24
	std	Y+6, r20                ; 2-byte Folded Spill
	std	Y+7, r21                ; 2-byte Folded Spill
	rjmp	LBB0_1
LBB0_1:                                 ; =>This Inner Loop Header: Depth=1
	rjmp	LBB0_2
LBB0_2:                                 ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	lds	r18, _Tv4main10fadeAmountVs5Int16
	lds	r19, _Tv4main10fadeAmountVs5Int16+1
	add	r24, r18
	adc	r25, r19
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	ldi	r18, 0
	ldi	r19, 0
	cp	r18, r24
	cpc	r19, r25
	brlt	LBB0_5
	rjmp	LBB0_3
LBB0_3:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldi	r24, 1
	std	Y+5, r24                ; 1-byte Folded Spill
	rjmp	LBB0_4
LBB0_4:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+5                ; 1-byte Folded Reload
	std	Y+4, r24                ; 1-byte Folded Spill
	rjmp	LBB0_10
LBB0_5:                                 ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_6
LBB0_6:                                 ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10brightnessVs5Int16
	lds	r25, _Tv4main10brightnessVs5Int16+1
	ldi	r18, 255
	ldi	r19, 0
	ldi	r20, 0
	ldi	r21, 1
	cp	r24, r18
	cpc	r25, r19
	std	Y+3, r20                ; 1-byte Folded Spill
	std	Y+2, r21                ; 1-byte Folded Spill
	brge	LBB0_7
	rjmp	LBB0_8
LBB0_7:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+2                ; 1-byte Folded Reload
	std	Y+1, r24                ; 1-byte Folded Spill
	rjmp	LBB0_9
LBB0_8:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+3                ; 1-byte Folded Reload
	std	Y+2, r24                ; 1-byte Folded Spill
	rjmp	LBB0_7
LBB0_9:                                 ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+1                ; 1-byte Folded Reload
	std	Y+5, r24                ; 1-byte Folded Spill
	rjmp	LBB0_4
LBB0_10:                                ;   in Loop: Header=BB0_1 Depth=1
	ldd	r24, Y+4                ; 1-byte Folded Reload
	andi	r24, 1
	cpi	r24, 0
	breq	LBB0_14
	rjmp	LBB0_11
LBB0_11:                                ;   in Loop: Header=BB0_1 Depth=1
	lds	r24, _Tv4main10fadeAmountVs5Int16
	lds	r25, _Tv4main10fadeAmountVs5Int16+1
	ldi	r18, 0
	ldi	r19, 0
	cp	r18, r24
	cpc	r19, r25
	brge	LBB0_13
	rjmp	LBB0_12
LBB0_12:                                ;   in Loop: Header=BB0_1 Depth=1
	ldi	r24, 251
	ldi	r25, 255
	sts	_Tv4main10fadeAmountVs5Int16+1, r25
	sts	_Tv4main10fadeAmountVs5Int16, r24
	rjmp	LBB0_13
LBB0_13:                                ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_14
LBB0_14:                                ;   in Loop: Header=BB0_1 Depth=1
	rjmp	LBB0_1
.Lfunc_end0:
	.size	main, .Lfunc_end0-main

	.hidden	_Tv4main10brightnessVs5Int16 ; @_Tv4main10brightnessVs5Int16
	.type	_Tv4main10brightnessVs5Int16,@object
	.section	.bss,"aw",@nobits
	.globl	_Tv4main10brightnessVs5Int16
	.p2align	1
_Tv4main10brightnessVs5Int16:
	.zero	2
	.size	_Tv4main10brightnessVs5Int16, 2

	.hidden	_Tv4main10fadeAmountVs5Int16 ; @_Tv4main10fadeAmountVs5Int16
	.type	_Tv4main10fadeAmountVs5Int16,@object
	.globl	_Tv4main10fadeAmountVs5Int16
	.p2align	1
_Tv4main10fadeAmountVs5Int16:
	.zero	2
	.size	_Tv4main10fadeAmountVs5Int16, 2

	.hidden	__swift_reflection_version ; @__swift_reflection_version
	.type	__swift_reflection_version,@object
	.section	.rodata,"a",@progbits
	.weak	__swift_reflection_version
__swift_reflection_version:
	.short	1                       ; 0x1
	.size	__swift_reflection_version, 2


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

When compiled with -O=1 we get this assembly, which is much more efficient but doesn't work. In particular the while (true) {} loop is effectively broken.

The rjmp jump back to LBB0_1 should (probably?) be the last instruction. Instead there's another couple of labels after that. Looks like some reordering went wrong during optimisation?

	.text
	.macosx_version_min 10, 9
	.file	"<stdin>"
	.globl	main
	.p2align	1
	.type	main,@function
main:                                   ; @main
; BB#0:                                 ; %entry
	ldi	r24, 0
	ldi	r25, 0
	sts	_Tv4main10brightnessVs5Int16+1, r25
	sts	_Tv4main10brightnessVs5Int16, r24
	ldi	r18, 5
	ldi	r19, 0
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
	ldi	r18, 251
	ldi	r19, 255
	ldi	r20, 255
	ldi	r21, 0
	rjmp	LBB0_2
LBB0_1:                                 ;   in Loop: Header=BB0_2 Depth=1
	sts	_Tv4main10fadeAmountVs5Int16+1, r19
	sts	_Tv4main10fadeAmountVs5Int16, r18
LBB0_2:                                 ; =>This Inner Loop Header: Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	add	r30, r22
	adc	r31, r23
	sts	_Tv4main10brightnessVs5Int16+1, r31
	sts	_Tv4main10brightnessVs5Int16, r30
	ldi	r22, 1
	cp	r24, r30
	cpc	r25, r31
	brlt	LBB0_5
; BB#3:                                 ;   in Loop: Header=BB0_2 Depth=1
	andi	r22, 1
	cpi	r22, 0
	breq	LBB0_2
; BB#4:                                 ;   in Loop: Header=BB0_2 Depth=1
	lds	r22, _Tv4main10fadeAmountVs5Int16
	lds	r23, _Tv4main10fadeAmountVs5Int16+1
	cp	r24, r22
	cpc	r25, r23
	brge	LBB0_2
	rjmp	LBB0_1
LBB0_5:
	lds	r30, _Tv4main10brightnessVs5Int16
	lds	r31, _Tv4main10brightnessVs5Int16+1
	cp	r30, r20
	cpc	r31, r21
	brge	LBB0_7
; BB#6:
	ldi	r22, 0
LBB0_7:
.Lfunc_end0:
	.size	main, .Lfunc_end0-main

	.hidden	_Tv4main10brightnessVs5Int16 ; @_Tv4main10brightnessVs5Int16
	.type	_Tv4main10brightnessVs5Int16,@object
	.section	.bss,"aw",@nobits
	.globl	_Tv4main10brightnessVs5Int16
	.p2align	1
_Tv4main10brightnessVs5Int16:
	.zero	2
	.size	_Tv4main10brightnessVs5Int16, 2

	.hidden	_Tv4main10fadeAmountVs5Int16 ; @_Tv4main10fadeAmountVs5Int16
	.type	_Tv4main10fadeAmountVs5Int16,@object
	.globl	_Tv4main10fadeAmountVs5Int16
	.p2align	1
_Tv4main10fadeAmountVs5Int16:
	.zero	2
	.size	_Tv4main10fadeAmountVs5Int16, 2

	.hidden	__swift_reflection_version ; @__swift_reflection_version
	.type	__swift_reflection_version,@object
	.section	.rodata,"a",@progbits
	.weak	__swift_reflection_version
__swift_reflection_version:
	.short	1                       ; 0x1
	.size	__swift_reflection_version, 2


	; Declaring this symbol tells the CRT that it should
	;copy all variables from program memory to RAM on startup
	.globl	__do_copy_data
	; Declaring this symbol tells the CRT that it should
	;clear the zeroed data section on startup
	.globl	__do_clear_bss

I want to use this script with bugpoint to reduce the test case but I'm new to this so it will take a while to get my head round the tool.

#!/bin/sh

PATH="build/llvm/bin:$PATH"

llc -O=1 -mattr=lpm < "$@" -march=avr | FileCheck "$@"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions