Skip to content

Commit

Permalink
[x86] use zero-extending load of a byte outside of loops too (2nd try)
Browse files Browse the repository at this point in the history
The first attempt missed regenerating the test files that are maintained by a
tool (update_llc_test_checks.py).

Original commit message:

This implements the main suggested change from issue #56498.
Using the shorter (non-extending) instruction with only
-Oz ("minsize") rather than -Os ("optsize") is left as a
possible follow-up.

As noted in the bug report, the zero-extending load may have
shorter latency/better throughput across a wide range of x86
micro-arches, and it avoids a potential false dependency.
The cost is an extra instruction byte.

This could cause perf ups and downs from secondary effects,
but I don't think it is possible to account for those in
advance, and that will likely also depend on exact micro-arch.
This does bring LLVM x86 codegen more in line with existing
gcc codegen, so if problems are exposed they are more likely
to occur for both compilers.

Differential Revision: https://reviews.llvm.org/D129775
  • Loading branch information
rotateright committed Jul 20, 2022
1 parent 2d889a8 commit f0dd12e
Show file tree
Hide file tree
Showing 211 changed files with 3,834 additions and 3,292 deletions.
12 changes: 6 additions & 6 deletions llvm/lib/Target/X86/X86FixupBWInsts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
switch (MI->getOpcode()) {

case X86::MOV8rm:
// Only replace 8 bit loads with the zero extending versions if
// in an inner most loop and not optimizing for size. This takes
// an extra byte to encode, and provides limited performance upside.
if (MachineLoop *ML = MLI->getLoopFor(&MBB))
if (ML->begin() == ML->end() && !OptForSize)
return tryReplaceLoad(X86::MOVZX32rm8, MI);
// Replace 8-bit loads with the zero-extending version if not optimizing
// for size. The extending op is cheaper across a wide range of uarch and
// it avoids a potentially expensive partial register stall. It takes an
// extra byte to encode, however, so don't do this when optimizing for size.
if (!OptForSize)
return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;

case X86::MOV16rm:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ target triple = "i686-unknown-unknown"
define i32 @test5(i32 %B, i8 %C) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl A, %eax
; CHECK-NEXT: shldl %cl, %edx, %eax
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2006-05-08-InstrSched.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ define void @test() {
; CHECK: # %bb.0:
; CHECK-NEXT: movl A, %eax
; CHECK-NEXT: movzwl 2(%eax), %eax
; CHECK-NEXT: movb B, %cl
; CHECK-NEXT: movzbl B, %ecx
; CHECK-NEXT: movl C, %edx
; CHECK-NEXT: andb $16, %cl
; CHECK-NEXT: shll %cl, %edx
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/2006-11-17-IllegalMove.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ define void @handle_vector_size_attribute() nounwind {
; CHECK-NEXT: cmpl $1, %eax
; CHECK-NEXT: ja .LBB0_2
; CHECK-NEXT: # %bb.1: # %bb77
; CHECK-NEXT: movb 0, %al
; CHECK-NEXT: movb 0, %al
; CHECK-NEXT: movzbl 0, %eax
; CHECK-NEXT: movzbl 0, %eax
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: .LBB0_2: # %bb84
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: movq _PyUFunc_API@GOTPCREL(%rip), %rbp
; CHECK-NEXT: movq (%rbp), %rax
; CHECK-NEXT: callq *216(%rax)
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %dl
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: je LBB0_11
; CHECK-NEXT: ## %bb.7: ## %cond_false.i
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %bl
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-NEXT: movzbl %bl, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: divb %dl
Expand All @@ -98,8 +98,8 @@ define ptr @ubyte_divmod(ptr %a, ptr %b) {
; CHECK-NEXT: LBB0_11: ## %cond_true.i
; CHECK-NEXT: movl $4, %edi
; CHECK-NEXT: callq _feraiseexcept
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %dl
; CHECK-NEXT: movb {{[0-9]+}}(%rsp), %bl
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx
; CHECK-NEXT: xorl %r14d, %r14d
; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: je LBB0_14
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
; CHECK-NEXT: .cfi_offset %ebx, -12
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: testb $1, %bl
; CHECK-NEXT: je LBB0_25
; CHECK-NEXT: ## %bb.1: ## %bb116.i
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2008-04-24-MemCpyBug.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ define void @testit63_entry_2E_ce() nounwind {
; CHECK-NEXT: movl %esp, %edi
; CHECK-NEXT: movl $g1s63, %esi
; CHECK-NEXT: rep;movsl (%esi), %es:(%edi)
; CHECK-NEXT: movb g1s63+62, %al
; CHECK-NEXT: movzbl g1s63+62, %eax
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl g1s63+60, %eax
; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define i32 @func_44(i16 signext %p_46) nounwind {
; SOURCE-SCHED-NEXT: xorl %ecx, %ecx
; SOURCE-SCHED-NEXT: cmpl $2, %eax
; SOURCE-SCHED-NEXT: setge %cl
; SOURCE-SCHED-NEXT: movb g_73, %dl
; SOURCE-SCHED-NEXT: movzbl g_73, %edx
; SOURCE-SCHED-NEXT: xorl %eax, %eax
; SOURCE-SCHED-NEXT: subb {{[0-9]+}}(%esp), %al
; SOURCE-SCHED-NEXT: testb %dl, %dl
Expand Down
45 changes: 36 additions & 9 deletions llvm/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
Original file line number Diff line number Diff line change
@@ -1,10 +1,44 @@
; RUN: llc < %s -mcpu=core2 | FileCheck %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.4"
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind

define fastcc i32 @cli_magic_scandesc(ptr %in) nounwind ssp {
; CHECK-LABEL: cli_magic_scandesc:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movzbl (%rsp), %eax
; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
; CHECK-NEXT: movq (%rdi), %rdx
; CHECK-NEXT: movq 8(%rdi), %rsi
; CHECK-NEXT: movq %rdx, (%rsp)
; CHECK-NEXT: movq 24(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq 16(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq 32(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq 40(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq 48(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq 56(%rdi), %rdx
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %al, (%rsp)
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq __stack_chk_guard(%rip), %rax
; CHECK-NEXT: cmpq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB0_2: # %entry
; CHECK-NEXT: callq __stack_chk_fail@PLT
entry:
%a = alloca [64 x i8]
%c = getelementptr inbounds [64 x i8], ptr %a, i64 0, i32 30
Expand All @@ -15,10 +49,3 @@ entry:
store i8 %e, ptr %c, align 8
ret i32 0
}

; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
; CHECK: movb (%rsp), [[R1:%.+]]
; CHECK: movb 30(%rsp), [[R0:%.+]]
; CHECK: movb [[R1]], (%rsp)
; CHECK: movb [[R0]], 30(%rsp)
; CHECK: callq ___stack_chk_fail
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nou
define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounwind {
; I386-NOCMOV-LABEL: negative_CopyFromReg:
; I386-NOCMOV: # %bb.0:
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; I386-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx
; I386-NOCMOV-NEXT: cmpb %cl, %al
Expand All @@ -255,7 +255,7 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
;
; I686-NOCMOV-LABEL: negative_CopyFromReg:
; I686-NOCMOV: # %bb.0:
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; I686-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx
; I686-NOCMOV-NEXT: cmpb %cl, %al
Expand Down Expand Up @@ -297,8 +297,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
define i8 @negative_CopyFromRegs(i32 %a1_wide, i32 %a2_wide) nounwind {
; I386-NOCMOV-LABEL: negative_CopyFromRegs:
; I386-NOCMOV: # %bb.0:
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %cl
; I386-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: cmpb %cl, %al
; I386-NOCMOV-NEXT: jg .LBB4_2
; I386-NOCMOV-NEXT: # %bb.1:
Expand All @@ -317,8 +317,8 @@ define i8 @negative_CopyFromRegs(i32 %a1_wide, i32 %a2_wide) nounwind {
;
; I686-NOCMOV-LABEL: negative_CopyFromRegs:
; I686-NOCMOV: # %bb.0:
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %cl
; I686-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %al
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I686-NOCMOV-NEXT: cmpb %cl, %al
; I686-NOCMOV-NEXT: jg .LBB4_2
; I686-NOCMOV-NEXT: # %bb.1:
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/GlobalISel/callingconv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ define void @test_abi_exts_call(ptr %addr) {
; X32-NEXT: .cfi_offset %esi, -12
; X32-NEXT: .cfi_offset %ebx, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movb (%eax), %bl
; X32-NEXT: movzbl (%eax), %ebx
; X32-NEXT: movzbl %bl, %esi
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll take_char
Expand All @@ -346,7 +346,7 @@ define void @test_abi_exts_call(ptr %addr) {
; X64-NEXT: pushq %rbx
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: .cfi_offset %rbx, -16
; X64-NEXT: movb (%rdi), %al
; X64-NEXT: movzbl (%rdi), %eax
; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq take_char
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ define i1 @test_load_i1(ptr %p1) {
; CHECK-LABEL: test_load_i1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movb (%eax), %al
; CHECK-NEXT: movzbl (%eax), %eax
; CHECK-NEXT: retl
%r = load i1, ptr %p1
ret i1 %r
Expand All @@ -18,7 +18,7 @@ define i8 @test_load_i8(ptr %p1) {
; CHECK-LABEL: test_load_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movb (%eax), %al
; CHECK-NEXT: movzbl (%eax), %eax
; CHECK-NEXT: retl
%r = load i8, ptr %p1
ret i8 %r
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/GlobalISel/memop-scalar.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
define i1 @test_load_i1(ptr %p1) {
; ALL-LABEL: test_load_i1:
; ALL: # %bb.0:
; ALL-NEXT: movb (%rdi), %al
; ALL-NEXT: movzbl (%rdi), %eax
; ALL-NEXT: retq
%r = load i1, ptr %p1
ret i1 %r
Expand All @@ -14,7 +14,7 @@ define i1 @test_load_i1(ptr %p1) {
define i8 @test_load_i8(ptr %p1) {
; ALL-LABEL: test_load_i8:
; ALL: # %bb.0:
; ALL-NEXT: movb (%rdi), %al
; ALL-NEXT: movzbl (%rdi), %eax
; ALL-NEXT: retq
%r = load i8, ptr %p1
ret i8 %r
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/PR40322.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ define void @_Z2ami(i32) #0 personality ptr @__gxx_personality_v0 {
; CHECK-MINGW-X86-NEXT: .cfi_def_cfa_offset 12
; CHECK-MINGW-X86-NEXT: .cfi_offset %esi, -12
; CHECK-MINGW-X86-NEXT: .cfi_offset %edi, -8
; CHECK-MINGW-X86-NEXT: movb __ZGVZ2amiE2au, %al
; CHECK-MINGW-X86-NEXT: movzbl __ZGVZ2amiE2au, %eax
; CHECK-MINGW-X86-NEXT: testb %al, %al
; CHECK-MINGW-X86-NEXT: jne LBB0_4
; CHECK-MINGW-X86-NEXT: # %bb.1: # %init.check
Expand Down
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/X86/abs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ define i8 @test_i8(i8 %a) nounwind {
;
; X86-LABEL: test_i8:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sarb $7, %cl
; X86-NEXT: xorb %cl, %al
Expand Down Expand Up @@ -530,13 +530,13 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: xorb %al, %bh
; X86-NEXT: subb %al, %bh
; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %cl
Expand Down Expand Up @@ -572,7 +572,7 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: sarb $7, %al
; X86-NEXT: xorb %al, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb %al, %ah
; X86-NEXT: sarb $7, %ah
; X86-NEXT: xorb %ah, %al
Expand All @@ -585,23 +585,23 @@ define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind {
; X86-NEXT: movb %dh, 11(%esi)
; X86-NEXT: movb %bl, 10(%esi)
; X86-NEXT: movb %bh, 9(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 8(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 7(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 6(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 5(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 4(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 3(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 2(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, 1(%esi)
; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: movb %al, (%esi)
; X86-NEXT: movl %esi, %eax
; X86-NEXT: addl $12, %esp
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/add-sub-bool.ll
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ define i64 @test_i64_add_add_var(i64 %x, i64 %y, i64 %z, i64 %w) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
Expand Down Expand Up @@ -529,7 +529,7 @@ define i32 @test_i32_sub_add_sext_var(i32 %x, i32 %y, i32 %z, i32 %w) nounwind {
; X86-LABEL: test_i32_sub_add_sext_var:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: sarl $31, %edx
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/and-load-fold.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
define i8 @foo(ptr %V) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: movb 2(%rdi), %al
; CHECK-NEXT: movzbl 2(%rdi), %eax
; CHECK-NEXT: andb $95, %al
; CHECK-NEXT: retq
%V3i8 = load <3 x i8>, ptr %V, align 4
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/and-sink.ll
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB1_5
; CHECK-NEXT: # %bb.1: # %bb0.preheader
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %bb0
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/and-with-overflow.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
define i8 @and_i8_ri(i8 zeroext %0, i8 zeroext %1) {
; X86-LABEL: and_i8_ri:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andb $-17, %cl
; X86-NEXT: je .LBB0_2
Expand All @@ -35,8 +35,8 @@ define i8 @and_i8_ri(i8 zeroext %0, i8 zeroext %1) {
define i8 @and_i8_rr(i8 zeroext %0, i8 zeroext %1) {
; X86-LABEL: and_i8_rr:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: andb %al, %cl
; X86-NEXT: je .LBB1_2
; X86-NEXT: # %bb.1:
Expand Down
Loading

0 comments on commit f0dd12e

Please sign in to comment.