diff --git a/0009-Fix-compilation-error.patch b/0009-Fix-compilation-error.patch new file mode 100644 index 0000000000000000000000000000000000000000..0edb60ad1c3fc3c37461112775a77722364c3348 --- /dev/null +++ b/0009-Fix-compilation-error.patch @@ -0,0 +1,24 @@ +From 19362cc8cbb9c4e8fc06a81ed0e887f249142c6a Mon Sep 17 00:00:00 2001 +From: eastb233 +Date: Thu, 28 Aug 2025 17:05:53 +0800 +Subject: [PATCH] Fix compilation error + +--- + .../include/llvm/Transforms/Instrumentation/PGOInstrumentation.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +index 6d2ad3d75744..5b1977b7de9a 100644 +--- a/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h ++++ b/llvm/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +@@ -18,6 +18,7 @@ + #include "llvm/ADT/ArrayRef.h" + #include "llvm/ADT/IntrusiveRefCntPtr.h" + #include "llvm/IR/PassManager.h" ++#include "llvm/Support/CommandLine.h" + #include + #include + +-- +2.43.0 + diff --git a/0010-NFC-Fix-no-plt-test-cases.patch b/0010-NFC-Fix-no-plt-test-cases.patch new file mode 100644 index 0000000000000000000000000000000000000000..f304542916682b6d72fdcee25328b15aa0855165 --- /dev/null +++ b/0010-NFC-Fix-no-plt-test-cases.patch @@ -0,0 +1,71 @@ +From 31fc8b7c0a6c1d844c87b706dccb5c83bd3c3b2c Mon Sep 17 00:00:00 2001 +From: eastb233 +Date: Fri, 5 Sep 2025 16:21:22 +0800 +Subject: [PATCH] [NFC] Fix no-plt test cases + +--- + llvm/test/CodeGen/AArch64/fno-plt.c | 6 ++--- + llvm/test/CodeGen/AArch64/fno-plt.cpp | 35 --------------------------- + 2 files changed, 3 insertions(+), 38 deletions(-) + delete mode 100644 llvm/test/CodeGen/AArch64/fno-plt.cpp + +diff --git a/llvm/test/CodeGen/AArch64/fno-plt.c b/llvm/test/CodeGen/AArch64/fno-plt.c +index 81ed912ce927..6cc59b683da2 100644 +--- a/llvm/test/CodeGen/AArch64/fno-plt.c ++++ b/llvm/test/CodeGen/AArch64/fno-plt.c +@@ -1,8 
+1,8 @@ +-// RUN: clang %s -shared -fno-plt -O2 -fno-inline -fPIC -o noplt.so ++// RUN: clang %s -shared -fno-plt -O2 -fno-inline -fPIC --target=aarch64-linux-gnu -fuse-ld=lld -nostdlib -o noplt.so + // RUN: llvm-objdump -d noplt.so | FileCheck %s --check-prefix=CHECK-NO-PLT + +-// RUN: clang %s -shared -O2 -fno-inline -fPIC -o plt.so +-// RUN: llvm-objdump -d plt.so | FileCheck %s --check-prefix=CHECK-PLT ++// RUN: clang %s -shared -O2 -fno-inline -fPIC --target=aarch64-linux-gnu -fuse-ld=lld -nostdlib -o plt.so ++// RUN: llvm-objdump -d plt.so | FileCheck %s --check-prefix=CHECK-PLT + + // CHECK-PLT: bar@plt + // CHECK-PLT: bar1@plt +diff --git a/llvm/test/CodeGen/AArch64/fno-plt.cpp b/llvm/test/CodeGen/AArch64/fno-plt.cpp +deleted file mode 100644 +index c5a1f2f24b37..000000000000 +--- a/llvm/test/CodeGen/AArch64/fno-plt.cpp ++++ /dev/null +@@ -1,35 +0,0 @@ +-// RUN: clang -x c++ %s -shared -fno-plt -O2 -fno-inline -fPIC -o noplt.so +-// RUN: llvm-objdump -d noplt.so | FileCheck %s --check-prefix=CHECK-NO-PLT +- +-// RUN: clang -x c++ %s -shared -O0 -fPIC -o plt.so +-// RUN: llvm-objdump -d plt.so | FileCheck %s --check-prefix=CHECK-PLT +- +-// RUN: clang -x c++ %s -shared -O2 -fno-inline -fPIC -o plt.so +-// RUN: llvm-objdump -d plt.so | FileCheck %s --check-prefix=CHECK-PLT +- +-// CHECK-PLT: bar@plt +-// CHECK-PLT: bar1@plt +-// CHECK-NO-PLT-NOT: bar@plt +-// CHECK-NO-PLT-NOT: bar1@plt +-// CHECK-NO-PLT-NOT: bar2@plt +- +-__attribute__((optnone)) +-void bar(int a) { +- return; +-} +- +-__attribute__((optnone)) +-extern void bar1(int); +- +-__attribute__((optnone)) +-static void bar2(int a) { +- return; +-} +- +-void foo(int a) { +- bar(a); +- bar1(a); +- bar2(a); +- return; +-} +- +-- +2.43.0 + diff --git a/0011-LoopDataPrefetch-Remove-preserved-analysis-info.patch b/0011-LoopDataPrefetch-Remove-preserved-analysis-info.patch new file mode 100644 index 0000000000000000000000000000000000000000..41be301e031eed5673c182cc2b9e9bdef0e24f78 --- /dev/null +++ 
b/0011-LoopDataPrefetch-Remove-preserved-analysis-info.patch @@ -0,0 +1,36 @@ +From 6cdf20ea854176c608c80983daccd91dd6d8073a Mon Sep 17 00:00:00 2001 +From: eastb233 +Date: Wed, 3 Sep 2025 10:10:58 +0800 +Subject: [PATCH] [LoopDataPrefetch] Remove preserved analysis info + +Since newly added features may clone loops/loads/BB etc, +do not preserve analysis info. +--- + llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +index dde7de406c58..ea3d7582f808 100644 +--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp ++++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +@@ -241,17 +241,12 @@ public: + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); +- AU.addPreserved(); + AU.addRequired(); + AU.addRequired(); +- AU.addPreserved(); + AU.addRequired(); +- AU.addPreserved(); + AU.addRequiredID(LoopSimplifyID); +- AU.addPreservedID(LoopSimplifyID); + AU.addRequired(); + AU.addRequired(); +- AU.addPreserved(); + AU.addRequired(); + } + +-- +2.43.0 + diff --git a/0012-AArch64-Fix-disable-lse-not-working.patch b/0012-AArch64-Fix-disable-lse-not-working.patch new file mode 100644 index 0000000000000000000000000000000000000000..fa96ca89da0528550d67542db62f75f600ba2717 --- /dev/null +++ b/0012-AArch64-Fix-disable-lse-not-working.patch @@ -0,0 +1,333 @@ +From 5091043d9a995f3edfd8ab36c7c42c631603174d Mon Sep 17 00:00:00 2001 +From: xiajingze +Date: Tue, 19 Aug 2025 14:51:32 +0800 +Subject: [PATCH] [AArch64] Fix disable lse not working + +Commit defb9334, meant to disable lse by default, is not working. This +patch fixes it. 
+--- + clang/test/CodeGen/aarch64-targetattr.c | 8 +- + .../llvm/TargetParser/AArch64TargetParser.h | 101 ++++++++++-------- + llvm/test/MC/AArch64/directive-arch.s | 2 +- + .../TargetParser/TargetParserTest.cpp | 8 +- + 4 files changed, 68 insertions(+), 51 deletions(-) + +diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c +index 9664b723a2b2..1e387cc97683 100644 +--- a/clang/test/CodeGen/aarch64-targetattr.c ++++ b/clang/test/CodeGen/aarch64-targetattr.c +@@ -94,10 +94,10 @@ void nosimd() {} + __attribute__((target("no-v9.3a"))) + void minusarch() {} + +-// CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } +-// CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } +-// CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } +-// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+lse,+neon,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } ++// CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } ++// CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" } ++// CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" } ++// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+neon,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" } + // CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" } + // CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" 
} + // CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" } +diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h +index c21bff14ece9..78e6ced92627 100644 +--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h ++++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h +@@ -320,7 +320,7 @@ struct ArchInfo { + + // clang-format off + inline constexpr ArchInfo ARMV8A = { VersionTuple{8, 0}, AProfile, "armv8-a", "+v8a", (AArch64::AEK_FP | AArch64::AEK_SIMD), }; +-inline constexpr ArchInfo ARMV8_1A = { VersionTuple{8, 1}, AProfile, "armv8.1-a", "+v8.1a", (ARMV8A.DefaultExts | AArch64::AEK_CRC | AArch64::AEK_LSE | AArch64::AEK_RDM)}; ++inline constexpr ArchInfo ARMV8_1A = { VersionTuple{8, 1}, AProfile, "armv8.1-a", "+v8.1a", (ARMV8A.DefaultExts | AArch64::AEK_CRC | AArch64::AEK_RDM)}; + inline constexpr ArchInfo ARMV8_2A = { VersionTuple{8, 2}, AProfile, "armv8.2-a", "+v8.2a", (ARMV8_1A.DefaultExts | AArch64::AEK_RAS)}; + inline constexpr ArchInfo ARMV8_3A = { VersionTuple{8, 3}, AProfile, "armv8.3-a", "+v8.3a", (ARMV8_2A.DefaultExts | AArch64::AEK_RCPC)}; + inline constexpr ArchInfo ARMV8_4A = { VersionTuple{8, 4}, AProfile, "armv8.4-a", "+v8.4a", (ARMV8_3A.DefaultExts | AArch64::AEK_DOTPROD)}; +@@ -366,106 +366,113 @@ inline constexpr CpuInfo CpuInfos[] = { + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + {"cortex-a55", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_DOTPROD | AArch64::AEK_RCPC)}, ++ AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_LSE)}, + {"cortex-a510", ARMV9A, + (AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_SB | + AArch64::AEK_PAUTH | AArch64::AEK_MTE | AArch64::AEK_SSBS | + AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | +- AArch64::AEK_FP16FML)}, ++ AArch64::AEK_FP16FML | AArch64::AEK_LSE)}, + {"cortex-a57", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + 
{"cortex-a65", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | +- AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)}, ++ AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"cortex-a65ae", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | +- AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)}, ++ AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"cortex-a72", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + {"cortex-a73", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + {"cortex-a75", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_DOTPROD | AArch64::AEK_RCPC)}, ++ AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_LSE)}, + {"cortex-a76", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS)}, ++ AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"cortex-a76ae", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS)}, ++ AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"cortex-a77", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_RCPC | AArch64::AEK_DOTPROD | AArch64::AEK_SSBS)}, ++ AArch64::AEK_RCPC | AArch64::AEK_DOTPROD | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"cortex-a78", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | + AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | +- AArch64::AEK_PROFILE)}, ++ AArch64::AEK_PROFILE | AArch64::AEK_LSE)}, + {"cortex-a78c", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | + AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | + 
AArch64::AEK_PROFILE | AArch64::AEK_FLAGM | AArch64::AEK_PAUTH | +- AArch64::AEK_FP16FML)}, ++ AArch64::AEK_FP16FML | AArch64::AEK_LSE)}, + {"cortex-a710", ARMV9A, + (AArch64::AEK_MTE | AArch64::AEK_PAUTH | AArch64::AEK_FLAGM | + AArch64::AEK_SB | AArch64::AEK_I8MM | AArch64::AEK_FP16FML | + AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | +- AArch64::AEK_BF16)}, ++ AArch64::AEK_BF16 | AArch64::AEK_LSE)}, + {"cortex-a715", ARMV9A, + (AArch64::AEK_SB | AArch64::AEK_SSBS | AArch64::AEK_MTE | + AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_PAUTH | + AArch64::AEK_I8MM | AArch64::AEK_PREDRES | AArch64::AEK_PERFMON | + AArch64::AEK_PROFILE | AArch64::AEK_SVE | AArch64::AEK_SVE2BITPERM | +- AArch64::AEK_BF16 | AArch64::AEK_FLAGM)}, ++ AArch64::AEK_BF16 | AArch64::AEK_FLAGM | AArch64::AEK_LSE)}, + {"cortex-r82", ARMV8R, (AArch64::AEK_LSE)}, + {"cortex-x1", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | + AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | +- AArch64::AEK_PROFILE)}, ++ AArch64::AEK_PROFILE | AArch64::AEK_LSE)}, + {"cortex-x1c", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | + AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | AArch64::AEK_SSBS | +- AArch64::AEK_PAUTH | AArch64::AEK_PROFILE)}, ++ AArch64::AEK_PAUTH | AArch64::AEK_PROFILE | AArch64::AEK_LSE)}, + {"cortex-x2", ARMV9A, + (AArch64::AEK_MTE | AArch64::AEK_BF16 | AArch64::AEK_I8MM | + AArch64::AEK_PAUTH | AArch64::AEK_SSBS | AArch64::AEK_SB | + AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | +- AArch64::AEK_FP16FML)}, ++ AArch64::AEK_FP16FML | AArch64::AEK_LSE)}, + {"cortex-x3", ARMV9A, + (AArch64::AEK_SVE | AArch64::AEK_PERFMON | AArch64::AEK_PROFILE | + AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_MTE | + AArch64::AEK_SVE2BITPERM | AArch64::AEK_SB | AArch64::AEK_PAUTH | + AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_PREDRES | +- AArch64::AEK_FLAGM | AArch64::AEK_SSBS)}, ++ 
AArch64::AEK_FLAGM | AArch64::AEK_SSBS | AArch64::AEK_LSE)}, + {"neoverse-e1", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | +- AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS)}, ++ AArch64::AEK_FP16 | AArch64::AEK_RCPC | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"neoverse-n1", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | + AArch64::AEK_FP16 | AArch64::AEK_PROFILE | AArch64::AEK_RCPC | +- AArch64::AEK_SSBS)}, ++ AArch64::AEK_SSBS | AArch64::AEK_LSE)}, + {"neoverse-n2", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_SM4 | AArch64::AEK_BF16 | AArch64::AEK_DOTPROD | + AArch64::AEK_FP16 | AArch64::AEK_I8MM | AArch64::AEK_MTE | + AArch64::AEK_SB | AArch64::AEK_SSBS | AArch64::AEK_SVE | +- AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM)}, ++ AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | AArch64::AEK_LSE)}, + {"neoverse-512tvb", ARMV8_4A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_SM4 | AArch64::AEK_SVE | AArch64::AEK_SSBS | + AArch64::AEK_FP16 | AArch64::AEK_BF16 | AArch64::AEK_DOTPROD | + AArch64::AEK_PROFILE | AArch64::AEK_RAND | AArch64::AEK_FP16FML | +- AArch64::AEK_I8MM)}, ++ AArch64::AEK_I8MM | AArch64::AEK_LSE)}, + {"neoverse-v1", ARMV8_4A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_SM4 | AArch64::AEK_SVE | AArch64::AEK_SSBS | + AArch64::AEK_FP16 | AArch64::AEK_BF16 | AArch64::AEK_DOTPROD | + AArch64::AEK_PROFILE | AArch64::AEK_RAND | AArch64::AEK_FP16FML | +- AArch64::AEK_I8MM)}, ++ AArch64::AEK_I8MM | AArch64::AEK_LSE)}, + {"neoverse-v2", ARMV9A, + (AArch64::AEK_SVE | AArch64::AEK_SVE2 | AArch64::AEK_SSBS | + AArch64::AEK_FP16 | AArch64::AEK_BF16 | AArch64::AEK_RAND | + AArch64::AEK_DOTPROD | AArch64::AEK_PROFILE | AArch64::AEK_SVE2BITPERM | +- AArch64::AEK_FP16FML | AArch64::AEK_I8MM | AArch64::AEK_MTE)}, ++ AArch64::AEK_FP16FML | AArch64::AEK_I8MM | AArch64::AEK_MTE | ++ 
AArch64::AEK_LSE)}, + {"cyclone", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_NONE)}, + {"apple-a7", ARMV8A, +@@ -478,50 +485,58 @@ inline constexpr CpuInfo CpuInfos[] = { + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC | + AArch64::AEK_RDM)}, + {"apple-a11", ARMV8_2A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | ++ AArch64::AEK_LSE)}, + {"apple-a12", ARMV8_3A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | ++ AArch64::AEK_LSE)}, + {"apple-a13", ARMV8_4A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | +- AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)}, ++ AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | ++ AArch64::AEK_LSE)}, + {"apple-a14", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | +- AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)}, ++ AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | ++ AArch64::AEK_LSE)}, + {"apple-a15", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | +- AArch64::AEK_BF16 | AArch64::AEK_I8MM)}, ++ AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_LSE)}, + {"apple-a16", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | +- AArch64::AEK_BF16 | AArch64::AEK_I8MM)}, ++ AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_LSE)}, + {"apple-m1", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | +- AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3)}, ++ AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | ++ AArch64::AEK_LSE)}, + {"apple-m2", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_FP16 | 
AArch64::AEK_FP16FML | AArch64::AEK_SHA3 | +- AArch64::AEK_BF16 | AArch64::AEK_I8MM)}, ++ AArch64::AEK_BF16 | AArch64::AEK_I8MM | AArch64::AEK_LSE)}, + {"apple-s4", ARMV8_3A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | ++ AArch64::AEK_LSE)}, + {"apple-s5", ARMV8_3A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | ++ AArch64::AEK_LSE)}, + {"exynos-m3", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + {"exynos-m4", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | +- AArch64::AEK_FP16)}, ++ AArch64::AEK_FP16 | AArch64::AEK_LSE)}, + {"exynos-m5", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_DOTPROD | +- AArch64::AEK_FP16)}, ++ AArch64::AEK_FP16 | AArch64::AEK_LSE)}, + {"falkor", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC | + AArch64::AEK_RDM)}, + {"saphira", ARMV8_3A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_PROFILE)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_PROFILE | ++ AArch64::AEK_LSE)}, + {"kryo", ARMV8A, (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, +- {"thunderx2t99", ARMV8_1A, (AArch64::AEK_AES | AArch64::AEK_SHA2)}, +- {"thunderx3t110", ARMV8_3A, (AArch64::AEK_AES | AArch64::AEK_SHA2)}, ++ {"thunderx2t99", ARMV8_1A, (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_LSE)}, ++ {"thunderx3t110", ARMV8_3A, (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_LSE)}, + {"thunderx", ARMV8A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_CRC)}, + {"thunderxt88", ARMV8A, +@@ -535,17 +550,19 @@ inline constexpr CpuInfo CpuInfos[] = { + AArch64::AEK_FP16 | AArch64::AEK_FP16FML | AArch64::AEK_PROFILE)}, + {"a64fx", ARMV8_2A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | +- AArch64::AEK_SVE)}, ++ AArch64::AEK_SVE | AArch64::AEK_LSE)}, + {"carmel", 
ARMV8_2A, +- (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16)}, ++ (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_FP16 | ++ AArch64::AEK_LSE)}, + {"ampere1", ARMV8_6A, + (AArch64::AEK_AES | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_FP16 | AArch64::AEK_SB | AArch64::AEK_SSBS | +- AArch64::AEK_RAND)}, ++ AArch64::AEK_RAND | AArch64::AEK_LSE)}, + {"ampere1a", ARMV8_6A, + (AArch64::AEK_FP16 | AArch64::AEK_RAND | AArch64::AEK_SM4 | + AArch64::AEK_SHA3 | AArch64::AEK_SHA2 | AArch64::AEK_AES | +- AArch64::AEK_MTE | AArch64::AEK_SB | AArch64::AEK_SSBS)}, ++ AArch64::AEK_MTE | AArch64::AEK_SB | AArch64::AEK_SSBS | ++ AArch64::AEK_LSE)}, + {"hip09", ARMV8_5A, + (AArch64::AEK_AES | AArch64::AEK_SM4 | AArch64::AEK_SHA2 | + AArch64::AEK_SHA3 | AArch64::AEK_FP16 | AArch64::AEK_PROFILE | +diff --git a/llvm/test/MC/AArch64/directive-arch.s b/llvm/test/MC/AArch64/directive-arch.s +index 32840b4cd142..c036b7da922e 100644 +--- a/llvm/test/MC/AArch64/directive-arch.s ++++ b/llvm/test/MC/AArch64/directive-arch.s +@@ -8,7 +8,7 @@ + # CHECK: aesd v0.16b, v2.16b + # CHECK: eor v0.16b, v0.16b, v2.16b + +- .arch armv8.1-a ++ .arch armv8.1-a+lse + casa w5, w7, [x20] + # CHECK: casa w5, w7, [x20] + +diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp +index 3762ae115653..ead36ed5d895 100644 +--- a/llvm/unittests/TargetParser/TargetParserTest.cpp ++++ b/llvm/unittests/TargetParser/TargetParserTest.cpp +@@ -1417,14 +1417,14 @@ INSTANTIATE_TEST_SUITE_P( + AArch64::AEK_CRC | AArch64::AEK_AES | + AArch64::AEK_SHA2 | AArch64::AEK_FP | + AArch64::AEK_SIMD | AArch64::AEK_RAS | +- AArch64::AEK_LSE | AArch64::AEK_RDM | ++ AArch64::AEK_RDM | + AArch64::AEK_PROFILE | AArch64::AEK_FP16 | + AArch64::AEK_FP16FML | AArch64::AEK_DOTPROD, + "8.2-A"), + ARMCPUTestParams( + "hip09", "armv8.5-a", "crypto-neon-fp-armv8", + AArch64::AEK_CRC | AArch64::AEK_FP | AArch64::AEK_SIMD | +- AArch64::AEK_RAS | AArch64::AEK_LSE | 
AArch64::AEK_RDM | ++ AArch64::AEK_RAS | AArch64::AEK_RDM | + AArch64::AEK_RCPC | AArch64::AEK_DOTPROD | AArch64::AEK_AES | + AArch64::AEK_SM4 | AArch64::AEK_SHA2 | AArch64::AEK_SHA3 | + AArch64::AEK_FP16 | AArch64::AEK_PROFILE | +@@ -1440,13 +1440,13 @@ INSTANTIATE_TEST_SUITE_P( + AArch64::AEK_SHA3 | AArch64::AEK_SM4 | AArch64::AEK_SSBS | + AArch64::AEK_SVE | AArch64::AEK_BF16 | AArch64::AEK_CRC | + AArch64::AEK_DOTPROD | AArch64::AEK_FP | AArch64::AEK_I8MM | +- AArch64::AEK_LSE | AArch64::AEK_RAS | AArch64::AEK_RCPC | ++ AArch64::AEK_RAS | AArch64::AEK_RCPC | + AArch64::AEK_RDM | AArch64::AEK_SIMD, + "8.5-A"), + ARMCPUTestParams( + "hip11", "armv9-a", "neon-fp-armv8", + AArch64::AEK_CRC | AArch64::AEK_FP | AArch64::AEK_SIMD | +- AArch64::AEK_RAS | AArch64::AEK_LSE | AArch64::AEK_RDM | ++ AArch64::AEK_RAS | AArch64::AEK_RDM | + AArch64::AEK_RCPC | AArch64::AEK_SVE | AArch64::AEK_SVE2 | + AArch64::AEK_DOTPROD | AArch64::AEK_MTE | AArch64::AEK_FP16FML | + AArch64::AEK_FP16 | AArch64::AEK_SVE2BITPERM | +-- +2.43.0 + diff --git a/0013-LoopVersioningLICM-Only-mark-pointers-with-generated.patch b/0013-LoopVersioningLICM-Only-mark-pointers-with-generated.patch new file mode 100644 index 0000000000000000000000000000000000000000..dfca2a5ef2bb65ac1ba2709169b669d569e78186 --- /dev/null +++ b/0013-LoopVersioningLICM-Only-mark-pointers-with-generated.patch @@ -0,0 +1,497 @@ +From 856e854dc80191e1737c0ce02d15d971c4ca3a99 Mon Sep 17 00:00:00 2001 +From: John Brawn +Date: Mon, 12 May 2025 10:15:22 +0100 +Subject: [PATCH] [LoopVersioningLICM] Only mark pointers with generated checks + as noalias (#135168) + +Currently when we version a loop all loads and stores have the noalias +metadata added to them. If there were some pointers that could not be +analysed, and thus we could not generate runtime aliasing checks for, +then we should not mark loads and stores using these pointers as +noalias. 
+ +This is done by getting rid of setNoAliasToLoop and instead using +annotateLoopWithNoAlias, as that already correctly handles partial alias +information. This does result in slightly different aliasing metadata +being generated, but it looks like it's more precise. + +Currently this doesn't result in any change to the transforms that +LoopVersioningLICM does, as LoopAccessAnalysis discards all results if +it couldn't analyse every pointer leading to no loop versioning +happening, but an upcoming patch will change that and we need this first +otherwise we incorrectly mark some pointers as noalias even when they +aren't. +--- + .../Transforms/Scalar/LoopVersioningLICM.cpp | 59 +--- + .../load-from-unknown-address.ll | 307 ++++++++++++++++++ + .../LoopVersioningLICM/loopversioningLICM1.ll | 8 +- + .../LoopVersioningLICM/loopversioningLICM2.ll | 2 +- + 4 files changed, 327 insertions(+), 49 deletions(-) + create mode 100644 llvm/test/Transforms/LoopVersioningLICM/load-from-unknown-address.ll + +diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +index 93800d08b789..1d12fae1340b 100644 +--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp ++++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +@@ -178,7 +178,6 @@ private: + bool legalLoopInstructions(); + bool legalLoopMemoryAccesses(); + bool isLoopAlreadyVisited(); +- void setNoAliasToLoop(Loop *VerLoop); + bool instructionSafeForVersioning(Instruction *I); + bool legalLoopVersioningOverlap(); + }; +@@ -354,6 +353,13 @@ bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) { + } + LoadAndStoreCounter++; + Value *Ptr = St->getPointerOperand(); ++ // Don't allow stores that we don't have runtime checks for, as we won't be ++ // able to mark them noalias meaning they would prevent any code motion. 
++ auto &Pointers = LAI->getRuntimePointerChecking()->Pointers; ++ if (!any_of(Pointers, [&](auto &P) { return P.PointerValue == Ptr; })) { ++ LLVM_DEBUG(dbgs() << " Found a store without a runtime check.\n"); ++ return false; ++ } + // Check loop invariant. + if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop)) + InvariantCounter++; +@@ -371,6 +377,13 @@ bool LoopVersioningLICM::legalLoopInstructions() { + InvariantCounter = 0; + IsReadOnlyLoop = true; + using namespace ore; ++ // Get LoopAccessInfo from current loop via the proxy. ++ LAI = &LAIs.getInfo(*CurLoop); ++ // Check LoopAccessInfo for need of runtime check. ++ if (LAI->getRuntimePointerChecking()->getChecks().empty()) { ++ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n"); ++ return false; ++ } + // Iterate over loop blocks and instructions of each block and check + // instruction safety. + for (auto *Block : CurLoop->getBlocks()) +@@ -384,13 +397,6 @@ bool LoopVersioningLICM::legalLoopInstructions() { + return false; + } + } +- // Get LoopAccessInfo from current loop via the proxy. +- LAI = &LAIs.getInfo(*CurLoop); +- // Check LoopAccessInfo for need of runtime check. +- if (LAI->getRuntimePointerChecking()->getChecks().empty()) { +- LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n"); +- return false; +- } + // Number of runtime-checks should be less then RuntimeMemoryCheckThreshold + if (LAI->getNumRuntimePointerChecks() > + VectorizerParams::RuntimeMemoryCheckThreshold) { +@@ -587,41 +593,6 @@ bool LoopVersioningLICM::isLegalForVersioning() { + return true; + } + +-/// Update loop with aggressive aliasing assumptions. +-/// It marks no-alias to any pairs of memory operations by assuming +-/// loop should not have any must-alias memory accesses pairs. +-/// During LoopVersioningLICM legality we ignore loops having must +-/// aliasing memory accesses. +-void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) { +- // Get latch terminator instruction. 
+- Instruction *I = VerLoop->getLoopLatch()->getTerminator(); +- // Create alias scope domain. +- MDBuilder MDB(I->getContext()); +- MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain"); +- StringRef Name = "LVAliasScope"; +- MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name); +- SmallVector Scopes{NewScope}, NoAliases{NewScope}; +- // Iterate over each instruction of loop. +- // set no-alias for all load & store instructions. +- for (auto *Block : CurLoop->getBlocks()) { +- for (auto &Inst : *Block) { +- // Only interested in instruction that may modify or read memory. +- if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory()) +- continue; +- // Set no-alias for current instruction. +- Inst.setMetadata( +- LLVMContext::MD_noalias, +- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias), +- MDNode::get(Inst.getContext(), NoAliases))); +- // set alias-scope for current instruction. +- Inst.setMetadata( +- LLVMContext::MD_alias_scope, +- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope), +- MDNode::get(Inst.getContext(), Scopes))); +- } +- } +-} +- + bool LoopVersioningLICM::run(DominatorTree *DT) { + // Do not do the transformation if disabled by metadata. + if (hasLICMVersioningTransformation(CurLoop) & TM_Disable) +@@ -709,7 +680,7 @@ bool LoopVersioningLICM::run(DominatorTree *DT) { + addStringMetadataToLoop(LVer.getVersionedLoop(), + "llvm.mem.parallel_loop_access"); + // Update version loop with aggressive aliasing assumption. 
+- setNoAliasToLoop(LVer.getVersionedLoop()); ++ LVer.annotateLoopWithNoAlias(); + Changed = true; + } + return Changed; +diff --git a/llvm/test/Transforms/LoopVersioningLICM/load-from-unknown-address.ll b/llvm/test/Transforms/LoopVersioningLICM/load-from-unknown-address.ll +new file mode 100644 +index 000000000000..c0d9f062a99f +--- /dev/null ++++ b/llvm/test/Transforms/LoopVersioningLICM/load-from-unknown-address.ll +@@ -0,0 +1,307 @@ ++; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ++; RUN: opt < %s -S -passes='function(loop-versioning-licm,loop-mssa(licm))' | FileCheck %s ++ ++target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" ++ ++; In these tests we have a loop where we can calculate the bounds of some memory ++; accesses but not others. ++ ++; Load from a gep whose bounds can't be calculated as the offset is loaded from memory ++; FIXME: Not knowing the bounds of the gep shouldn't stop us from hoisting the load of rval ++define void @gep_loaded_offset(ptr %p, ptr %q, ptr %r, i32 %n) { ++; CHECK-LABEL: define void @gep_loaded_offset ++; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_ADDR:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[P_ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[P]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_ADDR]], -1 ++; CHECK-NEXT: [[RVAL:%.*]] = load i64, ptr [[R]], align 4 ++; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[RVAL]] ++; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ++; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR]], i64 4 ++; CHECK-NEXT: store i32 [[VAL]], ptr [[P_ADDR]], align 4 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 
0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n.addr = phi i32 [ %dec, %while.body ], [ %n, %entry ] ++ %p.addr = phi ptr [ %incdec.ptr, %while.body ], [ %p, %entry ] ++ %dec = add nsw i32 %n.addr, -1 ++ %rval = load i64, ptr %r, align 4 ++ %arrayidx = getelementptr inbounds i32, ptr %q, i64 %rval ++ %val = load i32, ptr %arrayidx, align 4 ++ %incdec.ptr = getelementptr inbounds i8, ptr %p.addr, i64 4 ++ store i32 %val, ptr %p.addr, align 4 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; As above but with a store to the loaded address. This should prevent the loop ++; from being versioned, as we wouldn't be able to do any code motion. ++define void @gep_loaded_offset_with_store(ptr %p, ptr %q, ptr %r, i32 %n) { ++; CHECK-LABEL: define void @gep_loaded_offset_with_store ++; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_ADDR:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[P_ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[P]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_ADDR]], -1 ++; CHECK-NEXT: [[RVAL:%.*]] = load i64, ptr [[R]], align 4 ++; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[RVAL]] ++; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ++; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4 ++; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR]], i64 4 ++; CHECK-NEXT: store i32 [[VAL]], ptr [[P_ADDR]], align 4 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: 
while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n.addr = phi i32 [ %dec, %while.body ], [ %n, %entry ] ++ %p.addr = phi ptr [ %incdec.ptr, %while.body ], [ %p, %entry ] ++ %dec = add nsw i32 %n.addr, -1 ++ %rval = load i64, ptr %r, align 4 ++ %arrayidx = getelementptr inbounds i32, ptr %q, i64 %rval ++ %val = load i32, ptr %arrayidx, align 4 ++ store i32 0, ptr %arrayidx, align 4 ++ %incdec.ptr = getelementptr inbounds i8, ptr %p.addr, i64 4 ++ store i32 %val, ptr %p.addr, align 4 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; Load from a gep whose bounds can't be calculated as the pointer is loaded from memory ++; FIXME: Not knowing the bounds of the gep shouldn't stop us from hoisting the load of rval ++define void @gep_loaded_base(ptr %p, ptr %q, ptr %r, i32 %n) { ++; CHECK-LABEL: define void @gep_loaded_base ++; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_ADDR:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[P_ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[P]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_ADDR]], -1 ++; CHECK-NEXT: [[RVAL:%.*]] = load ptr, ptr [[R]], align 4 ++; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[RVAL]], i64 0 ++; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ++; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR]], i64 4 ++; CHECK-NEXT: store i32 [[VAL]], ptr [[P_ADDR]], align 4 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n.addr = phi i32 [ 
%dec, %while.body ], [ %n, %entry ] ++ %p.addr = phi ptr [ %incdec.ptr, %while.body ], [ %p, %entry ] ++ %dec = add nsw i32 %n.addr, -1 ++ %rval = load ptr, ptr %r, align 4 ++ %arrayidx = getelementptr inbounds i32, ptr %rval, i64 0 ++ %val = load i32, ptr %arrayidx, align 4 ++ %incdec.ptr = getelementptr inbounds i8, ptr %p.addr, i64 4 ++ store i32 %val, ptr %p.addr, align 4 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; Load from a gep with an offset that scalar evolution can't describe ++; FIXME: Not knowing the bounds of the gep shouldn't stop us from hoisting the load of qval ++define void @gep_strange_offset(ptr %p, ptr %q, ptr %r, i32 %n) { ++; CHECK-LABEL: define void @gep_strange_offset ++; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[R:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_ADDR:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[P_ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[P]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_ADDR]], -1 ++; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[Q]], align 4 ++; CHECK-NEXT: [[REM:%.*]] = srem i32 [[DEC]], 2 ++; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[REM]] to i64 ++; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[R]], i64 [[IDXPROM]] ++; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ++; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[VAL]], [[QVAL]] ++; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR]], i64 4 ++; CHECK-NEXT: store i32 [[ADD]], ptr [[P_ADDR]], align 4 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n.addr = 
phi i32 [ %dec, %while.body ], [ %n, %entry ] ++ %p.addr = phi ptr [ %incdec.ptr, %while.body ], [ %p, %entry ] ++ %dec = add nsw i32 %n.addr, -1 ++ %qval = load i32, ptr %q, align 4 ++ %rem = srem i32 %dec, 2 ++ %idxprom = sext i32 %rem to i64 ++ %arrayidx = getelementptr inbounds i32, ptr %r, i64 %idxprom ++ %val = load i32, ptr %arrayidx, align 4 ++ %add = add nsw i32 %val, %qval ++ %incdec.ptr = getelementptr inbounds i8, ptr %p.addr, i64 4 ++ store i32 %add, ptr %p.addr, align 4 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; A memcpy-like loop where the source address is loaded from a pointer ++; FIXME: We should be able to hoist the load of the source address pointer ++define void @memcpy_load_src(ptr %dst, ptr %src, i32 %n) { ++; CHECK-LABEL: define void @memcpy_load_src ++; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_VAL:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[DST_VAL:%.*]] = phi ptr [ [[DST_VAL_NEXT:%.*]], [[WHILE_BODY]] ], [ [[DST]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_VAL]], -1 ++; CHECK-NEXT: [[SRC_VAL:%.*]] = load ptr, ptr [[SRC]], align 8 ++; CHECK-NEXT: [[SRC_VAL_NEXT:%.*]] = getelementptr inbounds i8, ptr [[SRC_VAL]], i64 1 ++; CHECK-NEXT: [[DST_VAL_NEXT]] = getelementptr inbounds i8, ptr [[DST_VAL]], i64 1 ++; CHECK-NEXT: store ptr [[SRC_VAL_NEXT]], ptr [[SRC]], align 8 ++; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[SRC_VAL]], align 1 ++; CHECK-NEXT: store i8 [[VAL]], ptr [[DST_VAL]], align 1 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n_val = phi i32 [ %dec, 
%while.body ], [ %n, %entry ] ++ %dst_val = phi ptr [ %dst_val.next, %while.body ], [ %dst, %entry ] ++ %dec = add nsw i32 %n_val, -1 ++ %src_val = load ptr, ptr %src, align 8 ++ %src_val.next = getelementptr inbounds i8, ptr %src_val, i64 1 ++ %dst_val.next = getelementptr inbounds i8, ptr %dst_val, i64 1 ++ store ptr %src_val.next, ptr %src, align 8 ++ %val = load i8, ptr %src_val, align 1 ++ store i8 %val, ptr %dst_val, align 1 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; A memcpy-like loop where the destination address is loaded from a pointer ++; FIXME: We could hoist the load of the destination address, but doing the ++; bounds check of the store through that pointer itself requires using the ++; hoisted load. ++define void @memcpy_load_dst(ptr %dst, ptr %src, i32 %n) { ++; CHECK-LABEL: define void @memcpy_load_dst ++; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_VAL:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[SRC_VAL:%.*]] = phi ptr [ [[SRC_VAL_NEXT:%.*]], [[WHILE_BODY]] ], [ [[SRC]], [[ENTRY]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_VAL]], -1 ++; CHECK-NEXT: [[DST_VAL:%.*]] = load ptr, ptr [[DST]], align 8 ++; CHECK-NEXT: [[SRC_VAL_NEXT]] = getelementptr inbounds i8, ptr [[SRC_VAL]], i64 1 ++; CHECK-NEXT: [[DST_VAL_NEXT:%.*]] = getelementptr inbounds i8, ptr [[DST_VAL]], i64 1 ++; CHECK-NEXT: store ptr [[DST_VAL_NEXT]], ptr [[DST]], align 8 ++; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[SRC_VAL]], align 1 ++; CHECK-NEXT: store i8 [[VAL]], ptr [[DST_VAL]], align 1 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ 
++while.body: ++ %n_val = phi i32 [ %dec, %while.body ], [ %n, %entry ] ++ %src_val = phi ptr [ %src_val.next, %while.body ], [ %src, %entry ] ++ %dec = add nsw i32 %n_val, -1 ++ %dst_val = load ptr, ptr %dst, align 8 ++ %src_val.next = getelementptr inbounds i8, ptr %src_val, i64 1 ++ %dst_val.next = getelementptr inbounds i8, ptr %dst_val, i64 1 ++ store ptr %dst_val.next, ptr %dst, align 8 ++ %val = load i8, ptr %src_val, align 1 ++ store i8 %val, ptr %dst_val, align 1 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} ++ ++; A memcpy-like loop where both the source and destination pointers are loaded from pointers ++; FIXME: We could hoist the loads of both addresses, but doing the bounds check ++; of the store through the destination address itself requires using the hoisted ++; load. ++define void @memcpy_load_src_dst(ptr %dst, ptr %src, i32 %n) { ++; CHECK-LABEL: define void @memcpy_load_src_dst ++; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) { ++; CHECK-NEXT: entry: ++; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ++; CHECK: while.body: ++; CHECK-NEXT: [[N_VAL:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[N]], [[ENTRY:%.*]] ] ++; CHECK-NEXT: [[DEC]] = add nsw i32 [[N_VAL]], -1 ++; CHECK-NEXT: [[SRC_VAL:%.*]] = load ptr, ptr [[SRC]], align 8 ++; CHECK-NEXT: [[DST_VAL:%.*]] = load ptr, ptr [[DST]], align 8 ++; CHECK-NEXT: [[SRC_VAL_NEXT:%.*]] = getelementptr inbounds i8, ptr [[SRC_VAL]], i64 1 ++; CHECK-NEXT: [[DST_VAL_NEXT:%.*]] = getelementptr inbounds i8, ptr [[DST_VAL]], i64 1 ++; CHECK-NEXT: store ptr [[SRC_VAL_NEXT]], ptr [[SRC]], align 8 ++; CHECK-NEXT: store ptr [[DST_VAL_NEXT]], ptr [[DST]], align 8 ++; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[SRC_VAL]], align 1 ++; CHECK-NEXT: store i8 [[VAL]], ptr [[DST_VAL]], align 1 ++; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ++; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label 
[[WHILE_BODY]] ++; CHECK: while.end: ++; CHECK-NEXT: ret void ++; ++entry: ++ br label %while.body ++ ++while.body: ++ %n_val = phi i32 [ %dec, %while.body ], [ %n, %entry ] ++ %dec = add nsw i32 %n_val, -1 ++ %src_val = load ptr, ptr %src, align 8 ++ %dst_val = load ptr, ptr %dst, align 8 ++ %src_val.next = getelementptr inbounds i8, ptr %src_val, i64 1 ++ %dst_val.next = getelementptr inbounds i8, ptr %dst_val, i64 1 ++ store ptr %src_val.next, ptr %src, align 8 ++ store ptr %dst_val.next, ptr %dst, align 8 ++ %val = load i8, ptr %src_val, align 1 ++ store i8 %val, ptr %dst_val, align 1 ++ %tobool.not = icmp eq i32 %dec, 0 ++ br i1 %tobool.not, label %while.end, label %while.body ++ ++while.end: ++ ret void ++} +diff --git a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll +index eec772c52bbb..8337a2d2c9c8 100644 +--- a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll ++++ b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll +@@ -57,23 +57,23 @@ define i32 @foo(ptr nocapture %var1, ptr nocapture readnone %var2, ptr nocapture + ; CHECK-NEXT: [[CMP2_LVER_ORIG:%.*]] = icmp ult i32 [[INC_LVER_ORIG]], [[ITR]] + ; CHECK-NEXT: br i1 [[CMP2_LVER_ORIG]], label [[FOR_BODY3_LVER_ORIG]], label [[FOR_INC11_LOOPEXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] + ; CHECK: for.body3.ph: +-; CHECK-NEXT: [[ARRAYIDX7_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !alias.scope !2, !noalias !2 ++; CHECK-NEXT: [[ARRAYIDX7_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !alias.scope !2 + ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] + ; CHECK: for.body3: + ; CHECK-NEXT: [[ADD86:%.*]] = phi i32 [ [[ARRAYIDX7_PROMOTED]], [[FOR_BODY3_PH]] ], [ [[ADD8:%.*]], [[FOR_BODY3]] ] + ; CHECK-NEXT: [[J_113:%.*]] = phi i32 [ [[J_016]], [[FOR_BODY3_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ] + ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[J_113]] to i64 + ; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i32, ptr [[VAR1]], i64 [[IDXPROM]] +-; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4, !alias.scope !2, !noalias !2 ++; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4, !alias.scope !5, !noalias !2 + ; CHECK-NEXT: [[ADD8]] = add nsw i32 [[ADD86]], [[ADD]] + ; CHECK-NEXT: [[INC]] = add nuw i32 [[J_113]], 1 + ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], [[ITR]] +-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_INC11_LOOPEXIT_LOOPEXIT5:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ++; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_INC11_LOOPEXIT_LOOPEXIT5:%.*]], !llvm.loop [[LOOP7:![0-9]+]] + ; CHECK: for.inc11.loopexit.loopexit: + ; CHECK-NEXT: br label [[FOR_INC11_LOOPEXIT:%.*]] + ; CHECK: for.inc11.loopexit.loopexit5: + ; CHECK-NEXT: [[ADD8_LCSSA:%.*]] = phi i32 [ [[ADD8]], [[FOR_BODY3]] ] +-; CHECK-NEXT: store i32 [[ADD8_LCSSA]], ptr [[ARRAYIDX7]], align 4, !alias.scope !2, !noalias !2 ++; CHECK-NEXT: store i32 [[ADD8_LCSSA]], ptr [[ARRAYIDX7]], align 4, !alias.scope !2 + ; CHECK-NEXT: br label [[FOR_INC11_LOOPEXIT]] + ; CHECK: for.inc11.loopexit: + ; CHECK-NEXT: br label [[FOR_INC11]] +diff --git a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll +index 22ca534be7ae..a31da2a212ea 100644 +--- a/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll ++++ b/llvm/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll +@@ -9,7 +9,7 @@ + ; + ; CHECK: for.cond1.for.inc17_crit_edge.us.loopexit5: ; preds = %for.body3.us + ; CHECK-NEXT: %add14.us.lcssa = phi float [ %add14.us, %for.body3.us ] +-; CHECK-NEXT: store float %add14.us.lcssa, ptr %arrayidx.us, align 4, !alias.scope !0, !noalias !0 ++; CHECK-NEXT: store float %add14.us.lcssa, ptr %arrayidx.us, align 4, !alias.scope !3 + ; CHECK-NEXT: br label %for.cond1.for.inc17_crit_edge.us + ; + define i32 @foo(ptr nocapture %var2, ptr nocapture readonly %var3, i32 %itr) 
#0 { +-- +2.43.0 + diff --git a/llvm.spec b/llvm.spec index 94a9695eb500c66d1b0094e4cfb6b281cd7d90de..b74acfc9bb125cb1fcd8326dc3da2b4c07b10240 100644 --- a/llvm.spec +++ b/llvm.spec @@ -154,7 +154,7 @@ Name: llvm Name: llvm-toolset-%{maj_ver} %endif Version: %{maj_ver}.%{min_ver}.%{patch_ver} -Release: 51 +Release: 52 Summary: The Low Level Virtual Machine License: NCSA @@ -179,6 +179,11 @@ Patch0005: 0005-Fix-for-building-autotuner-with-mlir.patch Patch0006: 0006-backport-mlir-Make-it-possible-to-build-a-DenseResou.patch Patch0007: 0007-backport-mlir-Add-Python-bindings-for-DenseResourceE.patch Patch0008: 0008-Propeller-bugfix-for-MachineBasicBlock-hash-set.patch +Patch0009: 0009-Fix-compilation-error.patch +Patch0010: 0010-NFC-Fix-no-plt-test-cases.patch +Patch0011: 0011-LoopDataPrefetch-Remove-preserved-analysis-info.patch +Patch0012: 0012-AArch64-Fix-disable-lse-not-working.patch +Patch0013: 0013-LoopVersioningLICM-Only-mark-pointers-with-generated.patch BuildRequires: gcc BuildRequires: gcc-c++ @@ -1676,7 +1681,7 @@ reset_test_opts #region Test LLVM reset_test_opts # Xfail testing of update utility tools -export LIT_XFAIL="tools/UpdateTestChecks;CodeGen/Hexagon/loop-prefetch.ll" +export LIT_XFAIL="tools/UpdateTestChecks" %build_tool %cmake_target_opts check-llvm #endregion Test LLVM @@ -2895,6 +2900,9 @@ fi #endregion files %changelog +* Tue Sep 09 2025 eastb233 - 17.0.6-52 +- sync several bugfix patches from openeuler/llvm-project + * Tue Sep 02 2025 Xu Jin - 17.0.6-51 - remove rpath for openmp library