Skip to content

[SLPVectorizer] Add reduction of integer not vectorized #55693

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
nikic opened this issue May 25, 2022 · 4 comments
Open

[SLPVectorizer] Add reduction of integer not vectorized #55693

nikic opened this issue May 25, 2022 · 4 comments

Comments

@nikic
Copy link
Contributor

nikic commented May 25, 2022

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" 
target triple = "x86_64-unknown-linux-gnu"
  
; Reproducer: loads one i64 from %x, extracts each of its eight bytes with
; scalar shift/trunc/mask operations, and returns their sum as an i16.
; The additions form a serial chain; each step adds one more extracted byte.
; Eight values of at most 255 sum to at most 2040, which fits in i16 without
; wrapping, consistent with the nuw/nsw flags on every add.
define i16 @test(ptr %x) { 
start:
  %_51 = load i64, ptr %x, align 8
  ; Byte 0: truncate to i16 and keep the low 8 bits.
  %_4.i.i.i.sroa.4.16.extract.trunc = trunc i64 %_51 to i16
  %0 = and i16 %_4.i.i.i.sroa.4.16.extract.trunc, 255
  ; Byte 1: the shift is done on the i16 value, so the result already fits in
  ; 8 bits and no mask is applied.
  %1 = trunc i64 %_51 to i16
  %2 = lshr i16 %1, 8
  %_6.0.i.i.i.i.i.i.i.i.i.1 = add nuw nsw i16 %0, %2
  ; Byte 2: shift the i64 right by 16, truncate, mask to 8 bits.
  %_4.i.i.i.sroa.4.18.extract.shift = lshr i64 %_51, 16
  %_4.i.i.i.sroa.4.18.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.18.extract.shift to i16
  %3 = and i16 %_4.i.i.i.sroa.4.18.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.2 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.1, %3
  ; Byte 3: shift by 24.
  %_4.i.i.i.sroa.4.19.extract.shift = lshr i64 %_51, 24
  %_4.i.i.i.sroa.4.19.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.19.extract.shift to i16
  %4 = and i16 %_4.i.i.i.sroa.4.19.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.3 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.2, %4
  ; Byte 4: shift by 32.
  %_4.i.i.i.sroa.4.20.extract.shift = lshr i64 %_51, 32
  %_4.i.i.i.sroa.4.20.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.20.extract.shift to i16
  %5 = and i16 %_4.i.i.i.sroa.4.20.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.4 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.3, %5
  ; Byte 5: shift by 40.
  %_4.i.i.i.sroa.4.21.extract.shift = lshr i64 %_51, 40
  %_4.i.i.i.sroa.4.21.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.21.extract.shift to i16
  %6 = and i16 %_4.i.i.i.sroa.4.21.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.5 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.4, %6
  ; Byte 6: shift by 48.
  %_4.i.i.i.sroa.4.22.extract.shift = lshr i64 %_51, 48
  %_4.i.i.i.sroa.4.22.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.22.extract.shift to i16 
  %7 = and i16 %_4.i.i.i.sroa.4.22.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.6 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.5, %7
  ; Byte 7: a logical shift by 56 leaves only the top 8 bits of the i64, so the
  ; truncated value is used directly without a mask.
  %_4.i.i.i.sroa.4.23.extract.shift = lshr i64 %_51, 56
  %_4.i.i.i.sroa.4.23.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.23.extract.shift to i16
  %_6.0.i.i.i.i.i.i.i.i.i.7 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.6, %_4.i.i.i.sroa.4.23.extract.trunc
  ret i16 %_6.0.i.i.i.i.i.i.i.i.i.7
} 

This could be converted into a zext <8 x i8> to <8 x i16> followed by vector.reduce.add, but currently isn't. (I expect this to be profitable based on psadbw, but I'm not particularly familiar with this.)

@nikic
Copy link
Contributor Author

nikic commented May 25, 2022

@alexey-bataev Just to double check, SLPVectorizer currently doesn't handle "vectors" that are represented as integers (with "extracts" done via lshr+trunc) at all, right?

@alexey-bataev
Copy link
Member

alexey-bataev commented May 25, 2022

@alexey-bataev Just to double check, SLPVectorizer currently doesn't handle "vectors" that are represented as integers (with "extracts" done via lshr+trunc) at all, right?

Yes, it is not aware of this pattern yet. Plus, it looks like there is still a problem with the copyable elements; because of this, we end up with a very small, non-profitable tree.

@RKSimon
Copy link
Collaborator

RKSimon commented May 25, 2022

We have a similar problem on #48223 where SROA leaves us with a lot of zext/shift/or/trunc ops to pack elements into i64 elements.

@dianqk
Copy link
Member

dianqk commented Sep 21, 2023

This is another similar IR.

; ModuleID = 'simd_wide_sum.e3e91bc971ec0fa5-cgu.0'
source_filename = "simd_wide_sum.e3e91bc971ec0fa5-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: read) uwtable
; Loads one i64 from %x (declared readonly, 8-byte aligned, dereferenceable(8)),
; extracts each of its eight bytes with scalar shift/trunc/mask operations, and
; returns their sum as an i16. The additions form a serial chain.
; Eight values of at most 255 sum to at most 2040, which fits in i16 without
; wrapping, consistent with the nuw/nsw flags on every add.
define noundef i16 @wider_reduce_into_iter(ptr noalias nocapture noundef readonly align 8 dereferenceable(8) %x) unnamed_addr #0 personality ptr @rust_eh_personality {
start:
  %0 = load i64, ptr %x, align 8
  ; Byte 0: truncate to i16 and keep the low 8 bits.
  %self.sroa.4.16.extract.trunc = trunc i64 %0 to i16
  %_0.i.i.i.i.i.i.i.i.i = and i16 %self.sroa.4.16.extract.trunc, 255
  ; Byte 1: the shift is done on the i16 value, so the result already fits in
  ; 8 bits and no mask is applied.
  %1 = trunc i64 %0 to i16
  %2 = lshr i16 %1, 8
  %_4.0.i.i.i.i.i.i.i.i.1 = add nuw nsw i16 %_0.i.i.i.i.i.i.i.i.i, %2
  ; Byte 2: shift the i64 right by 16, truncate, mask to 8 bits.
  %self.sroa.4.18.extract.shift = lshr i64 %0, 16
  %self.sroa.4.18.extract.trunc = trunc i64 %self.sroa.4.18.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.2 = and i16 %self.sroa.4.18.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.2 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.1, %_0.i.i.i.i.i.i.i.i.i.2
  ; Byte 3: shift by 24.
  %self.sroa.4.19.extract.shift = lshr i64 %0, 24
  %self.sroa.4.19.extract.trunc = trunc i64 %self.sroa.4.19.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.3 = and i16 %self.sroa.4.19.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.3 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.2, %_0.i.i.i.i.i.i.i.i.i.3
  ; Byte 4: shift by 32.
  %self.sroa.4.20.extract.shift = lshr i64 %0, 32
  %self.sroa.4.20.extract.trunc = trunc i64 %self.sroa.4.20.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.4 = and i16 %self.sroa.4.20.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.4 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.3, %_0.i.i.i.i.i.i.i.i.i.4
  ; Byte 5: shift by 40.
  %self.sroa.4.21.extract.shift = lshr i64 %0, 40
  %self.sroa.4.21.extract.trunc = trunc i64 %self.sroa.4.21.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.5 = and i16 %self.sroa.4.21.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.5 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.4, %_0.i.i.i.i.i.i.i.i.i.5
  ; Byte 6: shift by 48.
  %self.sroa.4.22.extract.shift = lshr i64 %0, 48
  %self.sroa.4.22.extract.trunc = trunc i64 %self.sroa.4.22.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.6 = and i16 %self.sroa.4.22.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.6 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.5, %_0.i.i.i.i.i.i.i.i.i.6
  ; Byte 7: a logical shift by 56 leaves only the top 8 bits of the i64, so the
  ; truncated value is used directly without a mask.
  %self.sroa.4.23.extract.shift = lshr i64 %0, 56
  %self.sroa.4.23.extract.trunc = trunc i64 %self.sroa.4.23.extract.shift to i16
  %_4.0.i.i.i.i.i.i.i.i.7 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.6, %self.sroa.4.23.extract.trunc
  ret i16 %_4.0.i.i.i.i.i.i.i.i.7
}

; Function Attrs: nonlazybind uwtable
declare noundef i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: read) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 8, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{!"rustc version 1.74.0-dev"}

https://llvm.godbolt.org/z/Y4b3x5x9z is a similar IR but gets vectorized in LoopVectorizePass.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

6 participants