[SLPVectorizer] Add reduction of integer not vectorized #55693

nikic · 2022-05-25T10:01:19Z

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" 
target triple = "x86_64-unknown-linux-gnu"
  
; Sums the eight 8-bit fields of the i64 loaded from %x and returns the total
; as an i16. Field k below means bits 8k..8k+7 of the loaded value. Each field
; is extracted with scalar lshr/trunc/and and accumulated through a linear
; chain of i16 adds.
define i16 @test(ptr %x) { 
start:
  %_51 = load i64, ptr %x, align 8
  ; Field 0: truncate to i16 and keep the low 8 bits.
  %_4.i.i.i.sroa.4.16.extract.trunc = trunc i64 %_51 to i16
  %0 = and i16 %_4.i.i.i.sroa.4.16.extract.trunc, 255
  ; Field 1: the shift is done in i16, so only 8 bits remain and no mask is used.
  %1 = trunc i64 %_51 to i16
  %2 = lshr i16 %1, 8
  %_6.0.i.i.i.i.i.i.i.i.i.1 = add nuw nsw i16 %0, %2
  ; Field 2: shift right by 16 in i64, truncate, mask to 8 bits.
  %_4.i.i.i.sroa.4.18.extract.shift = lshr i64 %_51, 16
  %_4.i.i.i.sroa.4.18.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.18.extract.shift to i16
  %3 = and i16 %_4.i.i.i.sroa.4.18.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.2 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.1, %3
  ; Field 3 (shift by 24).
  %_4.i.i.i.sroa.4.19.extract.shift = lshr i64 %_51, 24
  %_4.i.i.i.sroa.4.19.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.19.extract.shift to i16
  %4 = and i16 %_4.i.i.i.sroa.4.19.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.3 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.2, %4
  ; Field 4 (shift by 32).
  %_4.i.i.i.sroa.4.20.extract.shift = lshr i64 %_51, 32
  %_4.i.i.i.sroa.4.20.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.20.extract.shift to i16
  %5 = and i16 %_4.i.i.i.sroa.4.20.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.4 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.3, %5
  ; Field 5 (shift by 40).
  %_4.i.i.i.sroa.4.21.extract.shift = lshr i64 %_51, 40
  %_4.i.i.i.sroa.4.21.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.21.extract.shift to i16
  %6 = and i16 %_4.i.i.i.sroa.4.21.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.5 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.4, %6
  ; Field 6 (shift by 48).
  %_4.i.i.i.sroa.4.22.extract.shift = lshr i64 %_51, 48
  %_4.i.i.i.sroa.4.22.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.22.extract.shift to i16 
  %7 = and i16 %_4.i.i.i.sroa.4.22.extract.trunc, 255
  %_6.0.i.i.i.i.i.i.i.i.i.6 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.5, %7
  ; Field 7: shifting the i64 right by 56 leaves only the top 8 bits, so the
  ; truncated value is added directly without an `and`.
  %_4.i.i.i.sroa.4.23.extract.shift = lshr i64 %_51, 56
  %_4.i.i.i.sroa.4.23.extract.trunc = trunc i64 %_4.i.i.i.sroa.4.23.extract.shift to i16
  %_6.0.i.i.i.i.i.i.i.i.i.7 = add nuw nsw i16 %_6.0.i.i.i.i.i.i.i.i.i.6, %_4.i.i.i.sroa.4.23.extract.trunc
  ; The result is the sum of all eight fields.
  ret i16 %_6.0.i.i.i.i.i.i.i.i.i.7
}

This could be converted into a zext <8 x i8> to <8 x i16> followed by vector.reduce.add, but currently isn't. (I expect this to be profitable based on psadbw, but I'm not particularly familiar with this.)

The text was updated successfully, but these errors were encountered:

nikic · 2022-05-25T13:17:18Z

@alexey-bataev Just to double check, SLPVectorizer currently doesn't handle "vectors" that are represented as integers (with "extracts" done via lshr+trunc) at all, right?

alexey-bataev · 2022-05-25T13:22:25Z

> @alexey-bataev Just to double check, SLPVectorizer currently doesn't handle "vectors" that are represented as integers (with "extracts" done via lshr+trunc) at all, right?

Yes, it is not aware of this pattern yet. Plus, it looks like there is still the problem with copyable elements; because of this we get a very small, non-profitable tree.

RKSimon · 2022-05-25T16:35:36Z

We have a similar problem on #48223 where SROA leaves us with a lot of zext/shift/or/trunc ops to pack elements into i64 elements.

dianqk · 2023-09-21T04:08:31Z

This is another similar IR.

; ModuleID = 'simd_wide_sum.e3e91bc971ec0fa5-cgu.0'
source_filename = "simd_wide_sum.e3e91bc971ec0fa5-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Sums the eight 8-bit fields of the i64 loaded from %x and returns the total
; as an i16. Field k below means bits 8k..8k+7 of the loaded value. The
; extraction and add chain has the same shape as a fully scalar byte-wise
; reduction: lshr/trunc/and per field, then a linear sequence of i16 adds.
; Function Attrs: mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: read) uwtable
define noundef i16 @wider_reduce_into_iter(ptr noalias nocapture noundef readonly align 8 dereferenceable(8) %x) unnamed_addr #0 personality ptr @rust_eh_personality {
start:
  %0 = load i64, ptr %x, align 8
  ; Field 0: truncate to i16 and keep the low 8 bits.
  %self.sroa.4.16.extract.trunc = trunc i64 %0 to i16
  %_0.i.i.i.i.i.i.i.i.i = and i16 %self.sroa.4.16.extract.trunc, 255
  ; Field 1: the shift is done in i16, so only 8 bits remain and no mask is used.
  %1 = trunc i64 %0 to i16
  %2 = lshr i16 %1, 8
  %_4.0.i.i.i.i.i.i.i.i.1 = add nuw nsw i16 %_0.i.i.i.i.i.i.i.i.i, %2
  ; Field 2: shift right by 16 in i64, truncate, mask to 8 bits.
  %self.sroa.4.18.extract.shift = lshr i64 %0, 16
  %self.sroa.4.18.extract.trunc = trunc i64 %self.sroa.4.18.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.2 = and i16 %self.sroa.4.18.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.2 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.1, %_0.i.i.i.i.i.i.i.i.i.2
  ; Field 3 (shift by 24).
  %self.sroa.4.19.extract.shift = lshr i64 %0, 24
  %self.sroa.4.19.extract.trunc = trunc i64 %self.sroa.4.19.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.3 = and i16 %self.sroa.4.19.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.3 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.2, %_0.i.i.i.i.i.i.i.i.i.3
  ; Field 4 (shift by 32).
  %self.sroa.4.20.extract.shift = lshr i64 %0, 32
  %self.sroa.4.20.extract.trunc = trunc i64 %self.sroa.4.20.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.4 = and i16 %self.sroa.4.20.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.4 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.3, %_0.i.i.i.i.i.i.i.i.i.4
  ; Field 5 (shift by 40).
  %self.sroa.4.21.extract.shift = lshr i64 %0, 40
  %self.sroa.4.21.extract.trunc = trunc i64 %self.sroa.4.21.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.5 = and i16 %self.sroa.4.21.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.5 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.4, %_0.i.i.i.i.i.i.i.i.i.5
  ; Field 6 (shift by 48).
  %self.sroa.4.22.extract.shift = lshr i64 %0, 48
  %self.sroa.4.22.extract.trunc = trunc i64 %self.sroa.4.22.extract.shift to i16
  %_0.i.i.i.i.i.i.i.i.i.6 = and i16 %self.sroa.4.22.extract.trunc, 255
  %_4.0.i.i.i.i.i.i.i.i.6 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.5, %_0.i.i.i.i.i.i.i.i.i.6
  ; Field 7: shifting the i64 right by 56 leaves only the top 8 bits, so the
  ; truncated value is added directly without an `and`.
  %self.sroa.4.23.extract.shift = lshr i64 %0, 56
  %self.sroa.4.23.extract.trunc = trunc i64 %self.sroa.4.23.extract.shift to i16
  %_4.0.i.i.i.i.i.i.i.i.7 = add nuw nsw i16 %_4.0.i.i.i.i.i.i.i.i.6, %self.sroa.4.23.extract.trunc
  ; The result is the sum of all eight fields.
  ret i16 %_4.0.i.i.i.i.i.i.i.i.7
}

; Function Attrs: nonlazybind uwtable
declare noundef i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: read) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }
attributes #1 = { nonlazybind uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 8, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{!"rustc version 1.74.0-dev"}

https://llvm.godbolt.org/z/Y4b3x5x9z is a similar IR but gets vectorized in LoopVectorizePass.

github-actions bot added the new issue label May 25, 2022

EugeneZelenko added llvm:SLPVectorizer and removed new issue labels May 25, 2022

nikic mentioned this issue May 25, 2022

Enable MIR inlining rust-lang/rust#91743

Merged

6 tasks

nunoplopes added the missed-optimization label May 27, 2022

nikic mentioned this issue Sep 20, 2023

LoopFullUnrollPass failed due to constant inference in opaque pointer mode. #65763

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SLPVectorizer] Add reduction of integer not vectorized #55693

[SLPVectorizer] Add reduction of integer not vectorized #55693

nikic commented May 25, 2022 •

edited by VoltrexKeyva

Loading

nikic commented May 25, 2022

alexey-bataev commented May 25, 2022 •

edited by nikic

Loading

RKSimon commented May 25, 2022

dianqk commented Sep 21, 2023

[SLPVectorizer] Add reduction of integer not vectorized #55693

[SLPVectorizer] Add reduction of integer not vectorized #55693

Comments

nikic commented May 25, 2022 • edited by VoltrexKeyva Loading

nikic commented May 25, 2022

alexey-bataev commented May 25, 2022 • edited by nikic Loading

RKSimon commented May 25, 2022

dianqk commented Sep 21, 2023

nikic commented May 25, 2022 •

edited by VoltrexKeyva

Loading

alexey-bataev commented May 25, 2022 •

edited by nikic

Loading