//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

.version 3.0
.target sm_20
.address_size 64

    // .globl _Z8distanceiiii
// elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc has been demoted
// elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1 has been demoted
// elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2 has been demoted
// elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc has been demoted
// reduce_max_final$__cuda_local_var_44307_33_non_const_sdata has been demoted
// reduce_max_final$__cuda_local_var_44308_30_non_const_idxData has been demoted
// reduce_max_main$__cuda_local_var_44442_33_non_const_sdata has been demoted
// reduce_max_main$__cuda_local_var_44443_30_non_const_idxData has been demoted
// reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow has been demoted
// reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol has been demoted
// reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal has been demoted
// reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata has been demoted
// reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData has been demoted
// reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow has been demoted
// reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol has been demoted
// reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal has been demoted
// reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata has been demoted
// reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData has been demoted
// elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc has been demoted
// elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1 has been demoted
// elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2 has been demoted
// elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc has been demoted
// reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata has been demoted
// reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData has been demoted
// reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata has been demoted
// reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata has been demoted
// reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata has been demoted
// reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData has been demoted

// ---------------------------------------------------------------------------
// _Z8distanceiiii  --  distance(int, int, int, int), f64 result
// Returns (param_0 - param_1)^2 + (param_2 - param_3)^2.  No square root is
// taken, so the result is a squared distance.
// ---------------------------------------------------------------------------
.visible .func (.param .b64 func_retval0) _Z8distanceiiii(
    .param .b32 _Z8distanceiiii_param_0,
    .param .b32 _Z8distanceiiii_param_1,
    .param .b32 _Z8distanceiiii_param_2,
    .param .b32 _Z8distanceiiii_param_3
)
{
    .reg .b32 %r<7>;
    .reg .f64 %fd<5>;

    ld.param.u32 %r1, [_Z8distanceiiii_param_0];
    ld.param.u32 %r2, [_Z8distanceiiii_param_1];
    ld.param.u32 %r3, [_Z8distanceiiii_param_2];
    ld.param.u32 %r4, [_Z8distanceiiii_param_3];

    // Integer differences, each widened to f64 before squaring.
    sub.s32 %r5, %r1, %r2;
    cvt.rn.f64.s32 %fd1, %r5;
    sub.s32 %r6, %r3, %r4;
    cvt.rn.f64.s32 %fd2, %r6;

    // d1*d1 + d2*d2, with the first product fused into the add.
    mul.f64 %fd3, %fd2, %fd2;
    fma.rn.f64 %fd4, %fd1, %fd1, %fd3;
    st.param.f64 [func_retval0+0], %fd4;
    ret;
}

// ---------------------------------------------------------------------------
// _Z13checkDistancePiS_iii  --  checkDistance(int*, int*, int, int, int)
// Splits param_3 into quotient q = param_3 / param_4 and remainder
// r = param_3 % param_4, then scans i = 0 .. param_2-1.
// Returns 0 as soon as param_0[i] == q and param_1[i] == r; returns 1 when
// no such i exists (including param_2 < 1).
// ---------------------------------------------------------------------------
    // .globl _Z13checkDistancePiS_iii
.visible .func (.param .b32 func_retval0) _Z13checkDistancePiS_iii(
    .param .b64 _Z13checkDistancePiS_iii_param_0,
    .param .b64 _Z13checkDistancePiS_iii_param_1,
    .param .b32 _Z13checkDistancePiS_iii_param_2,
    .param .b32 _Z13checkDistancePiS_iii_param_3,
    .param .b32 _Z13checkDistancePiS_iii_param_4
)
{
    .reg .pred %p<5>;
    .reg .b16 %rs<7>;
    .reg .b32 %r<13>;
    .reg .b64 %rd<7>;

    ld.param.u64 %rd1, [_Z13checkDistancePiS_iii_param_0];
    ld.param.u64 %rd2, [_Z13checkDistancePiS_iii_param_1];
    ld.param.u32 %r5, [_Z13checkDistancePiS_iii_param_2];
    ld.param.u32 %r7, [_Z13checkDistancePiS_iii_param_3];
    ld.param.u32 %r8, [_Z13checkDistancePiS_iii_param_4];
    div.s32 %r1, %r7, %r8;          // q
    rem.s32 %r2, %r7, %r8;          // r
    mov.u16 %rs2, 1;                // default result
    mov.u32 %r12, 0;                // i = 0
    setp.lt.s32 %p1, %r5, 1;
    mov.u16 %rs6, %rs2;
    @%p1 bra BB1_4;                 // empty range: return 1

BB1_1:
    // Generic-address loads: first array, then (only on a match) the second.
    mul.wide.s32 %rd3, %r12, 4;
    add.s64 %rd4, %rd1, %rd3;
    ld.u32 %r9, [%rd4];
    setp.ne.s32 %p2, %r9, %r1;
    @%p2 bra BB1_3;
    add.s64 %rd6, %rd2, %rd3;
    ld.u32 %r10, [%rd6];
    setp.eq.s32 %p3, %r10, %r2;
    mov.u16 %rs3, 0;
    mov.u16 %rs6, %rs3;
    @%p3 bra BB1_4;                 // both match: return 0

BB1_3:
    add.s32 %r12, %r12, 1;
    setp.lt.s32 %p4, %r12, %r5;
    mov.u16 %rs5, %rs2;
    mov.u16 %rs6, %rs5;
    @%p4 bra BB1_1;

BB1_4:
    cvt.u32.u16 %r11, %rs6;
    st.param.b32 [func_retval0+0], %r11;
    ret;
}

// ---------------------------------------------------------------------------
// _Z13checkDistancePViS0_iii  --  checkDistance(volatile int*, volatile int*,
//                                               int, int, int)
// Same scan as the non-volatile overload; the two array reads are
// ld.volatile.
// ---------------------------------------------------------------------------
    // .globl _Z13checkDistancePViS0_iii
.visible .func (.param .b32 func_retval0) _Z13checkDistancePViS0_iii(
    .param .b64 _Z13checkDistancePViS0_iii_param_0,
    .param .b64 _Z13checkDistancePViS0_iii_param_1,
    .param .b32 _Z13checkDistancePViS0_iii_param_2,
    .param .b32 _Z13checkDistancePViS0_iii_param_3,
    .param .b32 _Z13checkDistancePViS0_iii_param_4
)
{
    .reg .pred %p<5>;
    .reg .b16 %rs<7>;
    .reg .b32 %r<13>;
    .reg .b64 %rd<7>;

    ld.param.u64 %rd1, [_Z13checkDistancePViS0_iii_param_0];
    ld.param.u64 %rd2, [_Z13checkDistancePViS0_iii_param_1];
    ld.param.u32 %r5, [_Z13checkDistancePViS0_iii_param_2];
    ld.param.u32 %r7, [_Z13checkDistancePViS0_iii_param_3];
    ld.param.u32 %r8, [_Z13checkDistancePViS0_iii_param_4];
    div.s32 %r1, %r7, %r8;          // q
    rem.s32 %r2, %r7, %r8;          // r
    mov.u16 %rs2, 1;                // default result
    mov.u32 %r12, 0;                // i = 0
    setp.lt.s32 %p1, %r5, 1;
    mov.u16 %rs6, %rs2;
    @%p1 bra BB2_4;

BB2_1:
    mul.wide.s32 %rd3, %r12, 4;
    add.s64 %rd4, %rd1, %rd3;
    ld.volatile.u32 %r9, [%rd4];
    setp.ne.s32 %p2, %r9, %r1;
    @%p2 bra BB2_3;
    add.s64 %rd6, %rd2, %rd3;
    ld.volatile.u32 %r10, [%rd6];
    setp.eq.s32 %p3, %r10, %r2;
    mov.u16 %rs3, 0;
    mov.u16 %rs6, %rs3;
    @%p3 bra BB2_4;

BB2_3:
    add.s32 %r12, %r12, 1;
    setp.lt.s32 %p4, %r12, %r5;
    mov.u16 %rs5, %rs2;
    mov.u16 %rs6, %rs5;
    @%p4 bra BB2_1;

BB2_4:
    cvt.u32.u16 %r11, %rs6;
    st.param.b32 [func_retval0+0], %r11;
    ret;
}

// ---------------------------------------------------------------------------
// _Z9distancefiiii  --  distancef(int, int, int, int), f32 result
// Single-precision counterpart of _Z8distanceiiii:
// (param_0 - param_1)^2 + (param_2 - param_3)^2.
// ---------------------------------------------------------------------------
    // .globl _Z9distancefiiii
.visible .func (.param .b32 func_retval0) _Z9distancefiiii(
    .param .b32 _Z9distancefiiii_param_0,
    .param .b32 _Z9distancefiiii_param_1,
    .param .b32 _Z9distancefiiii_param_2,
    .param .b32 _Z9distancefiiii_param_3
)
{
    .reg .f32 %f<5>;
    .reg .b32 %r<7>;

    ld.param.u32 %r1, [_Z9distancefiiii_param_0];
    ld.param.u32 %r2, [_Z9distancefiiii_param_1];
    ld.param.u32 %r3, [_Z9distancefiiii_param_2];
    ld.param.u32 %r4, [_Z9distancefiiii_param_3];
    sub.s32 %r5, %r1, %r2;
    cvt.rn.f32.s32 %f1, %r5;
    sub.s32 %r6, %r3, %r4;
    cvt.rn.f32.s32 %f2, %r6;
    mul.f32 %f3, %f2, %f2;
    fma.rn.f32 %f4, %f1, %f1, %f3;
    st.param.f32 [func_retval0+0], %f4;
    ret;
}

// ---------------------------------------------------------------------------
// _Z14checkDistancefPiS_iii  --  checkDistancef(int*, int*, int, int, int)
// Instruction-for-instruction the same scan as _Z13checkDistancePiS_iii:
// returns 0 on the first i with param_0[i] == param_3 / param_4 and
// param_1[i] == param_3 % param_4, otherwise 1.
// ---------------------------------------------------------------------------
    // .globl _Z14checkDistancefPiS_iii
.visible .func (.param .b32 func_retval0) _Z14checkDistancefPiS_iii(
    .param .b64 _Z14checkDistancefPiS_iii_param_0,
    .param .b64 _Z14checkDistancefPiS_iii_param_1,
    .param .b32 _Z14checkDistancefPiS_iii_param_2,
    .param .b32 _Z14checkDistancefPiS_iii_param_3,
    .param .b32 _Z14checkDistancefPiS_iii_param_4
)
{
    .reg .pred %p<5>;
    .reg .b16 %rs<7>;
    .reg .b32 %r<13>;
    .reg .b64 %rd<7>;

    ld.param.u64 %rd1, [_Z14checkDistancefPiS_iii_param_0];
    ld.param.u64 %rd2, [_Z14checkDistancefPiS_iii_param_1];
    ld.param.u32 %r5, [_Z14checkDistancefPiS_iii_param_2];
    ld.param.u32 %r7, [_Z14checkDistancefPiS_iii_param_3];
    ld.param.u32 %r8, [_Z14checkDistancefPiS_iii_param_4];
    div.s32 %r1, %r7, %r8;          // q
    rem.s32 %r2, %r7, %r8;          // r
    mov.u16 %rs2, 1;                // default result
    mov.u32 %r12, 0;                // i = 0
    setp.lt.s32 %p1, %r5, 1;
    mov.u16 %rs6, %rs2;
    @%p1 bra BB4_4;

BB4_1:
    mul.wide.s32 %rd3, %r12, 4;
    add.s64 %rd4, %rd1, %rd3;
    ld.u32 %r9, [%rd4];
    setp.ne.s32 %p2, %r9, %r1;
    @%p2 bra BB4_3;
    add.s64 %rd6, %rd2, %rd3;
    ld.u32 %r10, [%rd6];
    setp.eq.s32 %p3, %r10, %r2;
    mov.u16 %rs3, 0;
    mov.u16 %rs6, %rs3;
    @%p3 bra BB4_4;

BB4_3:
    add.s32 %r12, %r12, 1;
    setp.lt.s32 %p4, %r12, %r5;
    mov.u16 %rs5, %rs2;
    mov.u16 %rs6, %rs5;
    @%p4 bra BB4_1;

BB4_4:
    cvt.u32.u16 %r11, %rs6;
    st.param.b32 [func_retval0+0], %r11;
    ret;
}

// ---------------------------------------------------------------------------
// _Z14checkDistancefPViS0_iii  --  checkDistancef(volatile int*,
//                                                 volatile int*, int, int, int)
// Volatile-load variant of _Z14checkDistancefPiS_iii.
// ---------------------------------------------------------------------------
    // .globl _Z14checkDistancefPViS0_iii
.visible .func (.param .b32 func_retval0) _Z14checkDistancefPViS0_iii(
    .param .b64 _Z14checkDistancefPViS0_iii_param_0,
    .param .b64 _Z14checkDistancefPViS0_iii_param_1,
    .param .b32 _Z14checkDistancefPViS0_iii_param_2,
    .param .b32 _Z14checkDistancefPViS0_iii_param_3,
    .param .b32 _Z14checkDistancefPViS0_iii_param_4
)
{
    .reg .pred %p<5>;
    .reg .b16 %rs<7>;
    .reg .b32 %r<13>;
    .reg .b64 %rd<7>;

    ld.param.u64 %rd1, [_Z14checkDistancefPViS0_iii_param_0];
    ld.param.u64 %rd2, [_Z14checkDistancefPViS0_iii_param_1];
    ld.param.u32 %r5, [_Z14checkDistancefPViS0_iii_param_2];
    ld.param.u32 %r7, [_Z14checkDistancefPViS0_iii_param_3];
    ld.param.u32 %r8, [_Z14checkDistancefPViS0_iii_param_4];
    div.s32 %r1, %r7, %r8;          // q
    rem.s32 %r2, %r7, %r8;          // r
    mov.u16 %rs2, 1;                // default result
    mov.u32 %r12, 0;                // i = 0
    setp.lt.s32 %p1, %r5, 1;
    mov.u16 %rs6, %rs2;
    @%p1 bra BB5_4;

BB5_1:
    mul.wide.s32 %rd3, %r12, 4;
    add.s64 %rd4, %rd1, %rd3;
    ld.volatile.u32 %r9, [%rd4];
    setp.ne.s32 %p2, %r9, %r1;
    @%p2 bra BB5_3;
    add.s64 %rd6, %rd2, %rd3;
    ld.volatile.u32 %r10, [%rd6];
    setp.eq.s32 %p3, %r10, %r2;
    mov.u16 %rs3, 0;
    mov.u16 %rs6, %rs3;
    @%p3 bra BB5_4;

BB5_3:
    add.s32 %r12, %r12, 1;
    setp.lt.s32 %p4, %r12, %r5;
    mov.u16 %rs5, %rs2;
    mov.u16 %rs6, %rs5;
    @%p4 bra BB5_1;

BB5_4:
    cvt.u32.u16 %r11, %rs6;
    st.param.b32 [func_retval0+0], %r11;
    ret;
}

// ---------------------------------------------------------------------------
// elt_prod_conj  --  kernel entry; only the header and the first register
// declarations sit on these source lines, the body follows.
// ---------------------------------------------------------------------------
    // .globl elt_prod_conj
.visible .entry elt_prod_conj(
    .param .u64 elt_prod_conj_param_0,
    .param .u64 elt_prod_conj_param_1,
    .param .u64 elt_prod_conj_param_2,
    .param .u32 elt_prod_conj_param_3
)
{
    .reg .pred %p<10>;
    .reg .b32 %r<6>;
    .reg .f64
%fd<41>;                            // operand of the ".reg .f64" that ends the preceding source line
    .reg .b64 %rd<23>;

    // ---- elt_prod_conj (body) ---------------------------------------------
    // Per element i = (ctaid.x << 8) + tid.x, with a = param_1[i] and
    // b = param_2[i] read as (re, im) f64 pairs:
    //     re = a.re*b.re + a.im*b.im
    //     im = a.im*b.re - a.re*b.im
    // The pair is divided by a magnitude m and stored to param_0[i].
    // Each shared array below is 4096 bytes = 256 entries of 16 bytes, and
    // the index uses a fixed shift by 8, i.e. 256 elements per block.
    // demoted variable
    .shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc[4096];
    // demoted variable
    .shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1[4096];
    // demoted variable
    .shared .align 16 .b8 elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2[4096];

    ld.param.u64 %rd6, [elt_prod_conj_param_0];
    ld.param.u64 %rd7, [elt_prod_conj_param_1];
    ld.param.u64 %rd8, [elt_prod_conj_param_2];
    ld.param.u32 %r3, [elt_prod_conj_param_3];
    mov.u32 %r4, %ctaid.x;
    shl.b32 %r5, %r4, 8;
    mov.u32 %r1, %tid.x;
    add.s32 %r2, %r5, %r1;          // i
    setp.ge.s32 %p1, %r2, %r3;
    @%p1 bra BB6_5;                 // i >= param_3: nothing to do

    // Stage a -> sc1[tid] and b -> sc2[tid].
    cvta.to.global.u64 %rd9, %rd7;
    cvt.u64.u32 %rd1, %r1;
    mul.wide.u32 %rd10, %r1, 16;
    mov.u64 %rd11, elt_prod_conj$__cuda_local_var_44210_45_non_const_sc1;
    add.s64 %rd2, %rd11, %rd10;
    cvt.s64.s32 %rd3, %r2;
    mul.wide.s32 %rd12, %r2, 16;
    add.s64 %rd13, %rd9, %rd12;
    ld.global.v2.f64 {%fd6, %fd7}, [%rd13];
    st.shared.v2.f64 [%rd2], {%fd6, %fd7};
    mov.u64 %rd14, elt_prod_conj$__cuda_local_var_44211_45_non_const_sc2;
    add.s64 %rd4, %rd14, %rd10;
    cvta.to.global.u64 %rd15, %rd8;
    add.s64 %rd16, %rd15, %rd12;
    ld.global.v2.f64 {%fd10, %fd11}, [%rd16];
    st.shared.v2.f64 [%rd4], {%fd10, %fd11};
    // NOTE(review): threads that took the i >= param_3 exit above never
    // reach this barrier.  Each thread only reads back its own slots.
    bar.sync 0;
    ld.shared.v2.f64 {%fd14, %fd15}, [%rd4];   // b
    ld.shared.v2.f64 {%fd18, %fd19}, [%rd2];   // a

    // Product terms: %fd39 = re, %fd2 = im; also written to sfc[tid].
    mul.f64 %fd22, %fd19, %fd15;
    fma.rn.f64 %fd39, %fd18, %fd14, %fd22;
    mul.f64 %fd23, %fd18, %fd15;
    mul.f64 %fd24, %fd19, %fd14;
    sub.f64 %fd2, %fd24, %fd23;
    shl.b64 %rd17, %rd1, 4;
    mov.u64 %rd18, elt_prod_conj$__cuda_local_var_44209_45_non_const_sfc;
    add.s64 %rd5, %rd18, %rd17;
    st.shared.v2.f64 [%rd5], {%fd39, %fd2};

    // Magnitude by scaling: hi = max(|re|,|im|), lo = min(|re|,|im|),
    // m = hi * sqrt(1 + (lo/hi)^2).  When hi == 0, or hi or lo exceeds the
    // largest finite double, hi + lo is used instead.
    abs.f64 %fd25, %fd39;
    abs.f64 %fd26, %fd2;
    setp.gt.f64 %p2, %fd25, %fd26;
    selp.f64 %fd27, %fd25, %fd26, %p2;         // hi
    selp.f64 %fd28, %fd26, %fd25, %p2;         // lo
    div.rn.f64 %fd29, %fd28, %fd27;
    fma.rn.f64 %fd30, %fd29, %fd29, 0d3FF0000000000000;
    sqrt.rn.f64 %fd31, %fd30;
    mul.f64 %fd32, %fd27, %fd31;
    setp.eq.f64 %p3, %fd27, 0d0000000000000000;
    setp.gt.f64 %p4, %fd27, 0d7FEFFFFFFFFFFFFF;
    or.pred %p5, %p3, %p4;
    setp.gt.f64 %p6, %fd28, 0d7FEFFFFFFFFFFFFF;
    or.pred %p7, %p5, %p6;
    add.f64 %fd33, %fd27, %fd28;
    selp.f64 %fd40, %fd33, %fd32, %p7;         // m

    // m == 0, or m fails the ordered test |m| <= +inf (i.e. NaN): fall
    // into BB6_3.
    setp.eq.f64 %p8, %fd40, 0d0000000000000000;
    @%p8 bra BB6_3;
    abs.f64 %fd34, %fd40;
    setp.le.f64 %p9, %fd34, 0d7FF0000000000000;
    @%p9 bra BB6_4;

BB6_3:
    // 4372995238176751616 == 0x3CB0000000000000, the bit pattern of 2^-52.
    // It overwrites the first 8 bytes of sfc[tid] and replaces both m and re.
    mov.u64 %rd19, 4372995238176751616;
    st.shared.u64 [%rd5], %rd19;
    mov.f64 %fd40, 0d3CB0000000000000;
    mov.f64 %fd39, %fd40;

BB6_4:
    // param_0[i] = (re / m, im / m)
    cvta.to.global.u64 %rd20, %rd6;
    shl.b64 %rd21, %rd3, 4;
    add.s64 %rd22, %rd20, %rd21;
    div.rn.f64 %fd37, %fd2, %fd40;
    div.rn.f64 %fd38, %fd39, %fd40;
    st.global.v2.f64 [%rd22], {%fd38, %fd37};

BB6_5:
    ret;
}

// ---------------------------------------------------------------------------
// elt_prod_conj_v2
// Same product as elt_prod_conj (re = a.re*b.re + a.im*b.im,
// im = a.im*b.re - a.re*b.im with a = param_1[i], b = param_2[i]), but the
// inputs are not staged through shared memory and the magnitude is the plain
// sqrt(re^2 + im^2).  The result (re/m, im/m) is stored to param_0[i].
// ---------------------------------------------------------------------------
    // .globl elt_prod_conj_v2
.visible .entry elt_prod_conj_v2(
    .param .u64 elt_prod_conj_v2_param_0,
    .param .u64 elt_prod_conj_v2_param_1,
    .param .u64 elt_prod_conj_v2_param_2,
    .param .u32 elt_prod_conj_v2_param_3
)
{
    .reg .pred %p<5>;
    .reg .b32 %r<6>;
    .reg .f64 %fd<30>;
    .reg .b64 %rd<17>;
    // demoted variable
    .shared .align 16 .b8 elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc[4096];

    ld.param.u64 %rd3, [elt_prod_conj_v2_param_0];
    ld.param.u64 %rd4, [elt_prod_conj_v2_param_1];
    ld.param.u64 %rd5, [elt_prod_conj_v2_param_2];
    ld.param.u32 %r3, [elt_prod_conj_v2_param_3];
    mov.u32 %r4, %ctaid.x;
    shl.b32 %r5, %r4, 8;
    mov.u32 %r1, %tid.x;
    add.s32 %r2, %r5, %r1;          // i = (ctaid.x << 8) + tid.x
    setp.ge.s32 %p1, %r2, %r3;
    @%p1 bra BB7_4;

    cvta.to.global.u64 %rd6, %rd4;
    cvt.s64.s32 %rd1, %r2;
    mul.wide.s32 %rd7, %r2, 16;
    add.s64 %rd8, %rd6, %rd7;
    cvta.to.global.u64 %rd9, %rd5;
    add.s64 %rd10, %rd9, %rd7;
    ld.global.v2.f64 {%fd6, %fd7}, [%rd10];    // b
    ld.global.v2.f64 {%fd10, %fd11}, [%rd8];   // a
    mul.f64 %fd14, %fd11, %fd7;
    mul.f64 %fd15, %fd10, %fd7;
    mul.f64 %fd16, %fd11, %fd6;
    mul.wide.u32 %rd11, %r1, 16;
    mov.u64 %rd12, elt_prod_conj_v2$__cuda_local_var_44242_45_non_const_sfc;
    add.s64 %rd2, %rd12, %rd11;
    sub.f64 %fd17, %fd16, %fd15;               // im
    fma.rn.f64 %fd18, %fd10, %fd6, %fd14;      // re
    st.shared.v2.f64 [%rd2], {%fd18, %fd17};
    // NOTE(review): as in elt_prod_conj, threads with i >= param_3 have
    // already left and do not arrive at this barrier.
    bar.sync 0;
    ld.shared.v2.f64 {%fd19, %fd20}, [%rd2];
    mov.f64 %fd28, %fd19;
    mul.f64 %fd21, %fd20, %fd20;
    fma.rn.f64 %fd22, %fd19, %fd19, %fd21;
    sqrt.rn.f64 %fd29, %fd22;                  // m

    // setp.gtu against +inf holds only for an unordered compare, so %p4 is
    // "m is NaN or m == 0".
    abs.f64 %fd23, %fd29;
    setp.gtu.f64 %p2, %fd23, 0d7FF0000000000000;
    setp.eq.f64 %p3, %fd29, 0d0000000000000000;
    or.pred %p4, %p2, %p3;
    @!%p4 bra BB7_3;
    bra.uni BB7_2;

BB7_2:
    // Replace m and re with 2^-52 (0x3CB0000000000000) and record the same
    // bit pattern in the first 8 bytes of sfc[tid].
    mov.u64 %rd13, 4372995238176751616;
    st.shared.u64 [%rd2], %rd13;
    mov.f64 %fd29, 0d3CB0000000000000;
    mov.f64 %fd28, %fd29;

BB7_3:
    cvta.to.global.u64 %rd14, %rd3;
    shl.b64 %rd15, %rd1, 4;
    add.s64 %rd16, %rd14, %rd15;
    div.rn.f64 %fd26, %fd20, %fd29;
    div.rn.f64 %fd27, %fd28, %fd29;
    st.global.v2.f64 [%rd16], {%fd27, %fd26};

BB7_4:
    ret;
}

// ---------------------------------------------------------------------------
// elt_prod_conj_v3
// Register-only variant: no shared memory and no barrier.  Computes
// m = sqrt(re^2 + im^2) first and switches to the scaled form
// hi * sqrt(1 + (lo/hi)^2) only when that m is NaN or zero.
// ---------------------------------------------------------------------------
    // .globl elt_prod_conj_v3
.visible .entry elt_prod_conj_v3(
    .param .u64 elt_prod_conj_v3_param_0,
    .param .u64 elt_prod_conj_v3_param_1,
    .param .u64 elt_prod_conj_v3_param_2,
    .param .u32 elt_prod_conj_v3_param_3
)
{
    .reg .pred %p<12>;
    .reg .b32 %r<6>;
    .reg .f64 %fd<33>;
    .reg .b64 %rd<13>;

    ld.param.u64 %rd2, [elt_prod_conj_v3_param_0];
    ld.param.u64 %rd3, [elt_prod_conj_v3_param_1];
    ld.param.u64 %rd4, [elt_prod_conj_v3_param_2];
    ld.param.u32 %r2, [elt_prod_conj_v3_param_3];
    mov.u32 %r3, %ctaid.x;
    shl.b32 %r4, %r3, 8;
    mov.u32 %r5, %tid.x;
    add.s32 %r1, %r4, %r5;          // i = (ctaid.x << 8) + tid.x
    setp.ge.s32 %p1, %r1, %r2;
    @%p1 bra BB8_4;

    cvta.to.global.u64 %rd5, %rd3;
    cvt.s64.s32 %rd1, %r1;
    mul.wide.s32 %rd6, %r1, 16;
    add.s64 %rd7, %rd5, %rd6;
    cvta.to.global.u64 %rd8, %rd4;
    add.s64 %rd9, %rd8, %rd6;
    ld.global.v2.f64 {%fd6, %fd7}, [%rd9];     // b
    ld.global.v2.f64 {%fd10, %fd11}, [%rd7];   // a
    mul.f64 %fd14, %fd11, %fd7;
    fma.rn.f64 %fd1, %fd10, %fd6, %fd14;       // re
    mul.f64 %fd15, %fd10, %fd7;
    mul.f64 %fd16, %fd11, %fd6;
    sub.f64 %fd2, %fd16, %fd15;                // im
    mul.f64 %fd17, %fd2, %fd2;
    fma.rn.f64 %fd18, %fd1, %fd1, %fd17;
    sqrt.rn.f64 %fd32, %fd18;                  // m
    abs.f64 %fd19, %fd32;
    setp.gtu.f64 %p2, %fd19, 0d7FF0000000000000;
    setp.eq.f64 %p3, %fd32, 0d0000000000000000;
    or.pred %p4, %p2, %p3;
    @!%p4 bra BB8_3;
    bra.uni BB8_2;

BB8_2:
    // Scaled magnitude: hi = max(|re|,|im|), lo = min(|re|,|im|).
    abs.f64 %fd20, %fd1;
    abs.f64 %fd21, %fd2;
    setp.gt.f64 %p5, %fd20, %fd21;
    selp.f64 %fd22, %fd20, %fd21, %p5;         // hi
    selp.f64
%fd23, %fd21, %fd20, %p5;           // lo = min(|re|,|im|); operands of the selp.f64 that ends the preceding source line
    // ---- elt_prod_conj_v3 (tail) ------------------------------------------
    // m = hi * sqrt(1 + (lo/hi)^2); hi + lo is used instead when hi == 0 or
    // when hi or lo exceeds the largest finite double.
    div.rn.f64 %fd24, %fd23, %fd22;
    fma.rn.f64 %fd25, %fd24, %fd24, 0d3FF0000000000000;
    sqrt.rn.f64 %fd26, %fd25;
    mul.f64 %fd27, %fd22, %fd26;
    setp.eq.f64 %p6, %fd22, 0d0000000000000000;
    setp.gt.f64 %p7, %fd22, 0d7FEFFFFFFFFFFFFF;
    or.pred %p8, %p6, %p7;
    setp.gt.f64 %p9, %fd23, 0d7FEFFFFFFFFFFFFF;
    or.pred %p10, %p8, %p9;
    add.f64 %fd28, %fd22, %fd23;
    selp.f64 %fd32, %fd28, %fd27, %p10;

BB8_3:
    // Divisor d = (m == 0) ? 2^-52 : m; param_0[i] = (re / d, im / d).
    cvta.to.global.u64 %rd10, %rd2;
    setp.eq.f64 %p11, %fd32, 0d0000000000000000;
    selp.f64 %fd29, 0d3CB0000000000000, %fd32, %p11;
    shl.b64 %rd11, %rd1, 4;
    add.s64 %rd12, %rd10, %rd11;
    div.rn.f64 %fd30, %fd2, %fd29;
    div.rn.f64 %fd31, %fd1, %fd29;
    st.global.v2.f64 [%rd12], {%fd31, %fd30};

BB8_4:
    ret;
}

// ---------------------------------------------------------------------------
// reduce_max_final
//   param_0 : f64 input values
//   param_1 : f64 output, one value per block, written at [ctaid.x]
//   param_2 : u32 index array; read at the positions of the scanned
//             elements and written at [ctaid.x] by thread 0
//   param_3 : element count n
//   param_4 : block size used for the stride and for selecting the
//             reduction steps (compared against 512, 256, ... 2)
//
// Each thread scans elements i and i + param_4, starting at
// i = ctaid.x * 2*param_4 + tid.x and advancing by nctaid.x * 2*param_4.
// It keeps the largest value strictly greater than its running maximum
// (initially 0.0), together with the matching entry of param_2.  The block
// then reduces the (value, index) pairs through shared memory.
//
// NOTE(review): the ">= 512" step addresses [%rd3+2048] / [%rd4+1024], which
// is past the 2048-byte / 1024-byte arrays declared below.  Presumably the
// kernel is only launched with param_4 <= 256 -- confirm against the host.
// NOTE(review): the last "bar.sync 0" (BB9_41) is reached only by threads
// with tid.x <= 31; the others branch around it to BB9_42.
// ---------------------------------------------------------------------------
    // .globl reduce_max_final
.visible .entry reduce_max_final(
    .param .u64 reduce_max_final_param_0,
    .param .u64 reduce_max_final_param_1,
    .param .u64 reduce_max_final_param_2,
    .param .u32 reduce_max_final_param_3,
    .param .u32 reduce_max_final_param_4
)
{
    .reg .pred %p<29>;
    .reg .b32 %r<38>;
    .reg .f64 %fd<78>;
    .reg .b64 %rd<30>;
    // demoted variable
    .shared .align 8 .b8 reduce_max_final$__cuda_local_var_44307_33_non_const_sdata[2048];
    // demoted variable
    .shared .align 4 .b8 reduce_max_final$__cuda_local_var_44308_30_non_const_idxData[1024];

    ld.param.u64 %rd5, [reduce_max_final_param_0];
    ld.param.u64 %rd6, [reduce_max_final_param_1];
    ld.param.u64 %rd7, [reduce_max_final_param_2];
    ld.param.u32 %r11, [reduce_max_final_param_3];
    ld.param.u32 %r12, [reduce_max_final_param_4];
    mov.u32 %r14, %tid.x;
    shl.b32 %r15, %r12, 1;                     // 2 * param_4
    mov.u32 %r16, %ctaid.x;
    mad.lo.s32 %r36, %r16, %r15, %r14;         // i
    mov.f64 %fd74, 0d0000000000000000;
    mov.f64 %fd77, %fd74;
    // FIX: %r37 (the thread's best index) used to be written only when a
    // scanned value exceeded the running maximum, yet it is always stored to
    // idxData at BB9_7.  Give it a defined value so a thread that finds
    // nothing greater than 0.0 no longer publishes an uninitialised register.
    mov.u32 %r37, 0;
    setp.ge.u32 %p1, %r36, %r11;
    @%p1 bra BB9_7;

BB9_1:
    // First candidate: element i.
    mov.f64 %fd60, %fd77;
    mov.f64 %fd1, %fd60;
    cvta.to.global.u64 %rd8, %rd5;
    cvt.u64.u32 %rd1, %r36;
    mul.wide.u32 %rd9, %r36, 8;
    add.s64 %rd10, %rd8, %rd9;
    ld.global.f64 %fd2, [%rd10];
    setp.geu.f64 %p2, %fd1, %fd2;              // keep current unless current < new
    mov.f64 %fd75, %fd1;
    @%p2 bra BB9_3;
    cvta.to.global.u64 %rd11, %rd7;
    shl.b64 %rd12, %rd1, 2;
    add.s64 %rd13, %rd11, %rd12;
    ld.global.u32 %r37, [%rd13];
    mov.f64 %fd75, %fd2;

BB9_3:
    // Second candidate: element i + param_4, if it is in range.
    mov.f64 %fd3, %fd75;
    add.s32 %r6, %r36, %r12;
    setp.ge.u32 %p3, %r6, %r11;
    mov.f64 %fd76, %fd3;
    @%p3 bra BB9_6;
    cvt.u64.u32 %rd2, %r6;
    mul.wide.u32 %rd15, %r6, 8;
    add.s64 %rd16, %rd8, %rd15;
    ld.global.f64 %fd4, [%rd16];
    setp.geu.f64 %p4, %fd3, %fd4;
    mov.f64 %fd59, %fd3;
    mov.f64 %fd76, %fd59;
    @%p4 bra BB9_6;
    cvta.to.global.u64 %rd17, %rd7;
    shl.b64 %rd18, %rd2, 2;
    add.s64 %rd19, %rd17, %rd18;
    ld.global.u32 %r37, [%rd19];
    mov.f64 %fd76, %fd4;

BB9_6:
    // i += nctaid.x * 2 * param_4
    mov.f64 %fd77, %fd76;
    mov.u32 %r18, %nctaid.x;
    mad.lo.s32 %r36, %r18, %r15, %r36;
    setp.lt.u32 %p5, %r36, %r11;
    mov.f64 %fd74, %fd77;
    @%p5 bra BB9_1;

BB9_7:
    // Publish this thread's (value, index) pair: sdata[tid], idxData[tid].
    mov.f64 %fd72, %fd74;
    mul.wide.u32 %rd20, %r14, 8;
    mov.u64 %rd21, reduce_max_final$__cuda_local_var_44307_33_non_const_sdata;
    add.s64 %rd3, %rd21, %rd20;
    st.shared.f64 [%rd3], %fd72;
    mul.wide.u32 %rd22, %r14, 4;
    mov.u64 %rd23, reduce_max_final$__cuda_local_var_44308_30_non_const_idxData;
    add.s64 %rd4, %rd23, %rd22;
    st.shared.u32 [%rd4], %r37;
    bar.sync 0;

    // Step for param_4 >= 512: threads 0..255 compare with slot tid + 256.
    setp.lt.s32 %p6, %r12, 512;
    @%p6 bra BB9_12;
    setp.gt.u32 %p7, %r14, 255;
    mov.f64 %fd73, %fd72;
    @%p7 bra BB9_11;
    ld.shared.f64 %fd7, [%rd3+2048];
    setp.geu.f64 %p8, %fd72, %fd7;
    mov.f64 %fd57, %fd72;
    mov.f64 %fd73, %fd57;
    @%p8 bra BB9_11;
    st.shared.f64 [%rd3], %fd7;
    ld.shared.u32 %r21, [%rd4+1024];
    st.shared.u32 [%rd4], %r21;
    mov.f64 %fd73, %fd7;

BB9_11:
    mov.f64 %fd72, %fd73;
    bar.sync 0;

BB9_12:
    // Step for param_4 >= 256: threads 0..127 compare with slot tid + 128.
    mov.f64 %fd70, %fd72;
    setp.lt.s32 %p9, %r12, 256;
    @%p9 bra BB9_17;
    setp.gt.u32 %p10, %r14, 127;
    mov.f64 %fd71, %fd70;
    @%p10 bra BB9_16;
    ld.shared.f64 %fd10, [%rd3+1024];
    setp.geu.f64 %p11, %fd70, %fd10;
    mov.f64 %fd53, %fd70;
    mov.f64 %fd71, %fd53;
    @%p11 bra BB9_16;
    st.shared.f64 [%rd3], %fd10;
    ld.shared.u32 %r23, [%rd4+512];
    st.shared.u32 [%rd4], %r23;
    mov.f64 %fd71, %fd10;

BB9_16:
    mov.f64 %fd70, %fd71;
    bar.sync 0;

BB9_17:
    // Step for param_4 >= 128: threads 0..63 compare with slot tid + 64.
    mov.f64 %fd68, %fd70;
    setp.lt.s32 %p12, %r12, 128;
    @%p12 bra BB9_22;
    setp.gt.u32 %p13, %r14, 63;
    mov.f64 %fd69, %fd68;
    @%p13 bra BB9_21;
    ld.shared.f64 %fd13, [%rd3+512];
    setp.geu.f64 %p14, %fd68, %fd13;
    mov.f64 %fd49, %fd68;
    mov.f64 %fd69, %fd49;
    @%p14 bra BB9_21;
    st.shared.f64 [%rd3], %fd13;
    ld.shared.u32 %r25, [%rd4+256];
    st.shared.u32 [%rd4], %r25;
    mov.f64 %fd69, %fd13;

BB9_21:
    mov.f64 %fd68, %fd69;
    bar.sync 0;

BB9_22:
    // Threads 0..31 finish the reduction with volatile shared accesses and
    // no barrier between the steps (offsets 32, 16, 8, 4, 2, 1 slots).
    mov.f64 %fd67, %fd68;
    setp.gt.u32 %p15, %r14, 31;
    @%p15 bra BB9_42;
    setp.lt.s32 %p16, %r12, 64;
    @%p16 bra BB9_26;
    ld.volatile.shared.f64 %fd28, [%rd3+256];
    setp.geu.f64 %p17, %fd67, %fd28;
    @%p17 bra BB9_26;
    ld.volatile.shared.f64 %fd67, [%rd3+256];
    st.volatile.shared.f64 [%rd3], %fd67;
    ld.volatile.shared.u32 %r27, [%rd4+128];
    st.volatile.shared.u32 [%rd4], %r27;

BB9_26:
    mov.f64 %fd66, %fd67;
    setp.lt.s32 %p18, %r12, 32;
    @%p18 bra BB9_29;
    ld.volatile.shared.f64 %fd29, [%rd3+128];
    setp.geu.f64 %p19, %fd66, %fd29;
    @%p19 bra BB9_29;
    ld.volatile.shared.f64 %fd66, [%rd3+128];
    st.volatile.shared.f64 [%rd3], %fd66;
    ld.volatile.shared.u32 %r28, [%rd4+64];
    st.volatile.shared.u32 [%rd4], %r28;

BB9_29:
    mov.f64 %fd65, %fd66;
    setp.lt.s32 %p20, %r12, 16;
    @%p20 bra BB9_32;
    ld.volatile.shared.f64 %fd30, [%rd3+64];
    setp.geu.f64 %p21, %fd65, %fd30;
    @%p21 bra BB9_32;
    ld.volatile.shared.f64 %fd65, [%rd3+64];
    st.volatile.shared.f64 [%rd3], %fd65;
    ld.volatile.shared.u32 %r29, [%rd4+32];
    st.volatile.shared.u32 [%rd4], %r29;

BB9_32:
    mov.f64 %fd64, %fd65;
    setp.lt.s32 %p22, %r12, 8;
    @%p22 bra BB9_35;
    ld.volatile.shared.f64 %fd31, [%rd3+32];
    setp.geu.f64 %p23, %fd64, %fd31;
    @%p23 bra BB9_35;
    ld.volatile.shared.f64 %fd64, [%rd3+32];
    st.volatile.shared.f64 [%rd3], %fd64;
    ld.volatile.shared.u32 %r30, [%rd4+16];
    st.volatile.shared.u32 [%rd4], %r30;

BB9_35:
    mov.f64 %fd63, %fd64;
    setp.lt.s32 %p24, %r12, 4;
    @%p24 bra BB9_38;
    ld.volatile.shared.f64 %fd32, [%rd3+16];
    setp.geu.f64 %p25, %fd63, %fd32;
    @%p25 bra BB9_38;
    ld.volatile.shared.f64 %fd63, [%rd3+16];
    st.volatile.shared.f64 [%rd3], %fd63;
    ld.volatile.shared.u32 %r31, [%rd4+8];
    st.volatile.shared.u32 [%rd4], %r31;

BB9_38:
    setp.lt.s32 %p26, %r12, 2;
    @%p26 bra BB9_41;
    ld.volatile.shared.f64 %fd33, [%rd3+8];
    setp.geu.f64 %p27, %fd63, %fd33;
    @%p27 bra BB9_41;
    ld.volatile.shared.f64 %fd34, [%rd3+8];
    st.volatile.shared.f64 [%rd3], %fd34;
    ld.volatile.shared.u32 %r32, [%rd4+4];
    st.volatile.shared.u32 [%rd4], %r32;

BB9_41:
    bar.sync 0;

BB9_42:
    // Thread 0 writes the block result: param_1[ctaid.x] = sdata[0],
    // param_2[ctaid.x] = idxData[0].
    setp.ne.s32 %p28, %r14, 0;
    @%p28 bra BB9_44;
    ld.shared.f64 %fd35, [reduce_max_final$__cuda_local_var_44307_33_non_const_sdata];
    cvta.to.global.u64 %rd24, %rd6;
    mul.wide.u32 %rd25, %r16, 8;
    add.s64 %rd26, %rd24, %rd25;
    st.global.f64 [%rd26], %fd35;
    ld.shared.u32 %r35, [reduce_max_final$__cuda_local_var_44308_30_non_const_idxData];
    cvta.to.global.u64 %rd27, %rd7;
    mul.wide.u32 %rd28, %r16, 4;
    add.s64 %rd29, %rd27, %rd28;
    st.global.u32 [%rd29], %r35;

BB9_44:
    ret;
}

// ---------------------------------------------------------------------------
// reduce_max_main  (prologue and start of the scan loop; the kernel continues
// after these lines)
//   param_0 : f64 input values
//   param_1 : f64 output, one value per block
//   param_2 : u32 output, one index per block
//   param_3 : element count n
//   param_4 : block size used for indexing and for the reduction steps
//
// Unlike reduce_max_final, the index kept alongside the maximum is the
// element position itself (%r33 / %r5), not a value loaded from param_2.
// The scan starts at i = ctaid.x * param_4 + tid.x and also inspects
// i + param_4.
// ---------------------------------------------------------------------------
    // .globl reduce_max_main
.visible .entry reduce_max_main(
    .param .u64 reduce_max_main_param_0,
    .param .u64 reduce_max_main_param_1,
    .param .u64 reduce_max_main_param_2,
    .param .u32 reduce_max_main_param_3,
    .param .u32 reduce_max_main_param_4
)
{
    .reg .pred %p<29>;
    .reg .b32 %r<39>;
    .reg .f64 %fd<75>;
    .reg .b64 %rd<22>;
    // demoted variable
    .shared .align 8 .b8 reduce_max_main$__cuda_local_var_44442_33_non_const_sdata[2048];
    // demoted variable
    .shared .align 4 .b8 reduce_max_main$__cuda_local_var_44443_30_non_const_idxData[1024];

    ld.param.u64 %rd3, [reduce_max_main_param_0];
    ld.param.u64 %rd4, [reduce_max_main_param_1];
    ld.param.u64 %rd5, [reduce_max_main_param_2];
    ld.param.u32 %r10, [reduce_max_main_param_3];
    ld.param.u32 %r11, [reduce_max_main_param_4];
    mov.u32 %r13, %tid.x;
    mov.u32 %r14, %ctaid.x;
    mad.lo.s32 %r33, %r14, %r11, %r13;         // i
    mov.f64 %fd72, 0d0000000000000000;
    mov.f64 %fd73, %fd72;
    // FIX: define the carried best-index registers before first use.
    //   %r37 is read at the top of the first loop iteration (BB10_1) before
    //        anything has written it;
    //   %r36 is the value stored to idxData after the loop and was left
    //        undefined whenever the loop is skipped.
    mov.u32 %r36, 0;
    mov.u32 %r37, 0;
    setp.ge.u32 %p1, %r33, %r10;
    @%p1 bra BB10_4;

BB10_1:
    // First candidate: element i; select (value, index) without branching.
    mov.f64 %fd1, %fd73;
    mov.u32 %r2, %r37;
    cvta.to.global.u64 %rd6, %rd3;
    mul.wide.u32 %rd7, %r33, 8;
    add.s64 %rd8, %rd6, %rd7;
    ld.global.f64 %fd27, [%rd8];
    setp.lt.f64 %p2, %fd1, %fd27;
    selp.f64 %fd74, %fd27, %fd1, %p2;
    selp.b32 %r38, %r33, %r2, %p2;
    add.s32 %r5, %r33, %r11;                   // i + param_4
// ---- reduce_max_main (continued: second candidate, block reduction, store) --
    // Second candidate: element %r5 = i + param_4, if it is below n.
    setp.ge.u32 %p3, %r5, %r10;
    @%p3 bra BB10_3;
    mul.wide.u32 %rd10, %r5, 8;
    add.s64 %rd11, %rd6, %rd10;
    ld.global.f64 %fd28, [%rd11];
    setp.lt.f64 %p4, %fd74, %fd28;
    selp.f64 %fd74, %fd28, %fd74, %p4;
    selp.b32 %r38, %r5, %r38, %p4;

BB10_3:
    // Carry (value, index) to the next iteration; i += nctaid.x * param_4.
    mov.f64 %fd73, %fd74;
    mov.u32 %r37, %r38;
    mov.u32 %r15, %nctaid.x;
    mad.lo.s32 %r33, %r15, %r11, %r33;
    setp.lt.u32 %p5, %r33, %r10;
    mov.u32 %r36, %r37;
    mov.f64 %fd72, %fd73;
    @%p5 bra BB10_1;

BB10_4:
    // Publish this thread's pair: sdata[tid] = value, idxData[tid] = index.
    mov.f64 %fd70, %fd72;
    mul.wide.u32 %rd12, %r13, 8;
    mov.u64 %rd13, reduce_max_main$__cuda_local_var_44442_33_non_const_sdata;
    add.s64 %rd1, %rd13, %rd12;
    st.shared.f64 [%rd1], %fd70;
    mul.wide.u32 %rd14, %r13, 4;
    mov.u64 %rd15, reduce_max_main$__cuda_local_var_44443_30_non_const_idxData;
    add.s64 %rd2, %rd15, %rd14;
    st.shared.u32 [%rd2], %r36;
    bar.sync 0;

    // Step for param_4 >= 512: threads 0..255 compare with slot tid + 256.
    // NOTE(review): [%rd1+2048] / [%rd2+1024] lie past the 2048-byte sdata
    // and 1024-byte idxData arrays declared in this kernel's prologue;
    // presumably param_4 never exceeds 256 -- confirm against the host code.
    setp.lt.s32 %p6, %r11, 512;
    @%p6 bra BB10_9;
    setp.gt.u32 %p7, %r13, 255;
    mov.f64 %fd71, %fd70;
    @%p7 bra BB10_8;
    ld.shared.f64 %fd6, [%rd1+2048];
    setp.geu.f64 %p8, %fd70, %fd6;             // keep current unless current < other
    mov.f64 %fd41, %fd70;
    mov.f64 %fd71, %fd41;
    @%p8 bra BB10_8;
    st.shared.f64 [%rd1], %fd6;
    ld.shared.u32 %r18, [%rd2+1024];
    st.shared.u32 [%rd2], %r18;
    mov.f64 %fd71, %fd6;

BB10_8:
    mov.f64 %fd70, %fd71;
    bar.sync 0;

BB10_9:
    // Step for param_4 >= 256: threads 0..127 compare with slot tid + 128.
    mov.f64 %fd68, %fd70;
    setp.lt.s32 %p9, %r11, 256;
    @%p9 bra BB10_14;
    setp.gt.u32 %p10, %r13, 127;
    mov.f64 %fd69, %fd68;
    @%p10 bra BB10_13;
    ld.shared.f64 %fd9, [%rd1+1024];
    setp.geu.f64 %p11, %fd68, %fd9;
    mov.f64 %fd45, %fd68;
    mov.f64 %fd69, %fd45;
    @%p11 bra BB10_13;
    st.shared.f64 [%rd1], %fd9;
    ld.shared.u32 %r20, [%rd2+512];
    st.shared.u32 [%rd2], %r20;
    mov.f64 %fd69, %fd9;

BB10_13:
    mov.f64 %fd68, %fd69;
    bar.sync 0;

BB10_14:
    // Step for param_4 >= 128: threads 0..63 compare with slot tid + 64.
    mov.f64 %fd66, %fd68;
    setp.lt.s32 %p12, %r11, 128;
    @%p12 bra BB10_19;
    setp.gt.u32 %p13, %r13, 63;
    mov.f64 %fd67, %fd66;
    @%p13 bra BB10_18;
    ld.shared.f64 %fd12, [%rd1+512];
    setp.geu.f64 %p14, %fd66, %fd12;
    mov.f64 %fd49, %fd66;
    mov.f64 %fd67, %fd49;
    @%p14 bra BB10_18;
    st.shared.f64 [%rd1], %fd12;
    ld.shared.u32 %r22, [%rd2+256];
    st.shared.u32 [%rd2], %r22;
    mov.f64 %fd67, %fd12;

BB10_18:
    mov.f64 %fd66, %fd67;
    bar.sync 0;

BB10_19:
    // Threads 0..31 finish with volatile shared accesses and no barrier
    // between the steps (offsets 32, 16, 8, 4, 2, 1 slots).
    mov.f64 %fd65, %fd66;
    setp.gt.u32 %p15, %r13, 31;
    @%p15 bra BB10_39;
    setp.lt.s32 %p16, %r11, 64;
    @%p16 bra BB10_23;
    ld.volatile.shared.f64 %fd29, [%rd1+256];
    setp.geu.f64 %p17, %fd65, %fd29;
    @%p17 bra BB10_23;
    ld.volatile.shared.f64 %fd65, [%rd1+256];
    st.volatile.shared.f64 [%rd1], %fd65;
    ld.volatile.shared.u32 %r24, [%rd2+128];
    st.volatile.shared.u32 [%rd2], %r24;

BB10_23:
    mov.f64 %fd64, %fd65;
    setp.lt.s32 %p18, %r11, 32;
    @%p18 bra BB10_26;
    ld.volatile.shared.f64 %fd30, [%rd1+128];
    setp.geu.f64 %p19, %fd64, %fd30;
    @%p19 bra BB10_26;
    ld.volatile.shared.f64 %fd64, [%rd1+128];
    st.volatile.shared.f64 [%rd1], %fd64;
    ld.volatile.shared.u32 %r25, [%rd2+64];
    st.volatile.shared.u32 [%rd2], %r25;

BB10_26:
    mov.f64 %fd63, %fd64;
    setp.lt.s32 %p20, %r11, 16;
    @%p20 bra BB10_29;
    ld.volatile.shared.f64 %fd31, [%rd1+64];
    setp.geu.f64 %p21, %fd63, %fd31;
    @%p21 bra BB10_29;
    ld.volatile.shared.f64 %fd63, [%rd1+64];
    st.volatile.shared.f64 [%rd1], %fd63;
    ld.volatile.shared.u32 %r26, [%rd2+32];
    st.volatile.shared.u32 [%rd2], %r26;

BB10_29:
    mov.f64 %fd62, %fd63;
    setp.lt.s32 %p22, %r11, 8;
    @%p22 bra BB10_32;
    ld.volatile.shared.f64 %fd32, [%rd1+32];
    setp.geu.f64 %p23, %fd62, %fd32;
    @%p23 bra BB10_32;
    ld.volatile.shared.f64 %fd62, [%rd1+32];
    st.volatile.shared.f64 [%rd1], %fd62;
    ld.volatile.shared.u32 %r27, [%rd2+16];
    st.volatile.shared.u32 [%rd2], %r27;

BB10_32:
    mov.f64 %fd61, %fd62;
    setp.lt.s32 %p24, %r11, 4;
    @%p24 bra BB10_35;
    ld.volatile.shared.f64 %fd33, [%rd1+16];
    setp.geu.f64 %p25, %fd61, %fd33;
    @%p25 bra BB10_35;
    ld.volatile.shared.f64 %fd61, [%rd1+16];
    st.volatile.shared.f64 [%rd1], %fd61;
    ld.volatile.shared.u32 %r28, [%rd2+8];
    st.volatile.shared.u32 [%rd2], %r28;

BB10_35:
    setp.lt.s32 %p26, %r11, 2;
    @%p26 bra BB10_38;
    ld.volatile.shared.f64 %fd34, [%rd1+8];
    setp.geu.f64 %p27, %fd61, %fd34;
    @%p27 bra BB10_38;
    ld.volatile.shared.f64 %fd35, [%rd1+8];
    st.volatile.shared.f64 [%rd1], %fd35;
    ld.volatile.shared.u32 %r29, [%rd2+4];
    st.volatile.shared.u32 [%rd2], %r29;

BB10_38:
    // NOTE(review): only threads with tid.x <= 31 reach this barrier; the
    // others jump straight to BB10_39.
    bar.sync 0;

BB10_39:
    // Thread 0 writes the block result: param_1[ctaid.x] = sdata[0],
    // param_2[ctaid.x] = idxData[0].
    setp.ne.s32 %p28, %r13, 0;
    @%p28 bra BB10_41;
    ld.shared.f64 %fd36, [reduce_max_main$__cuda_local_var_44442_33_non_const_sdata];
    cvta.to.global.u64 %rd16, %rd4;
    mul.wide.u32 %rd17, %r14, 8;
    add.s64 %rd18, %rd16, %rd17;
    st.global.f64 [%rd18], %fd36;
    ld.shared.u32 %r32, [reduce_max_main$__cuda_local_var_44443_30_non_const_idxData];
    cvta.to.global.u64 %rd19, %rd5;
    mul.wide.u32 %rd20, %r14, 4;
    add.s64 %rd21, %rd19, %rd20;
    st.global.u32 [%rd21], %r32;

BB10_41:
    ret;
}

// ---------------------------------------------------------------------------
// reduce_max_filter_final  --  kernel entry; header, register and shared
// declarations and the first parameter loads.  The body continues after
// these lines.
// ---------------------------------------------------------------------------
    // .globl reduce_max_filter_final
.visible .entry reduce_max_filter_final(
    .param .u64 reduce_max_filter_final_param_0,
    .param .u64 reduce_max_filter_final_param_1,
    .param .u64 reduce_max_filter_final_param_2,
    .param .u32 reduce_max_filter_final_param_3,
    .param .u32 reduce_max_filter_final_param_4,
    .param .u32 reduce_max_filter_final_param_5,
    .param .u64 reduce_max_filter_final_param_6,
    .param .u32 reduce_max_filter_final_param_7
)
{
    .reg .pred %p<78>;
    .reg .b32 %r<173>;
    .reg .f64 %fd<102>;
    .reg .b64 %rd<134>;
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow[40];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol[40];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal[40];
    // demoted variable
    .shared .align 8 .b8 reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata[2048];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData[1024];

    ld.param.u64 %rd18, [reduce_max_filter_final_param_0];
    ld.param.u64 %rd19, [reduce_max_filter_final_param_1];
    ld.param.u64 %rd20, [reduce_max_filter_final_param_2];
    ld.param.u32 %r67, [reduce_max_filter_final_param_3];
    ld.param.u32 %r68, [reduce_max_filter_final_param_4];
    ld.param.u32 %r69, [reduce_max_filter_final_param_5];
    ld.param.u64 %rd21,
[reduce_max_filter_final_param_6]; ld.param.u32 %r70, [reduce_max_filter_final_param_7]; mov.u32 %r71, %tid.x; setp.ge.u32 %p1, %r71, %r70; @%p1 bra BB11_2; cvta.to.global.u64 %rd22, %rd21; mul.wide.u32 %rd23, %r71, 4; add.s64 %rd24, %rd22, %rd23; ld.global.u32 %r73, [%rd24]; mov.u64 %rd25, reduce_max_filter_final$__cuda_local_var_44585_30_non_const_smaxesVal; add.s64 %rd26, %rd25, %rd23; st.shared.u32 [%rd26], %r73; div.u32 %r74, %r73, %r68; mov.u64 %rd27, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd28, %rd27, %rd23; st.shared.u32 [%rd28], %r74; rem.u32 %r75, %r73, %r68; mov.u64 %rd29, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd30, %rd29, %rd23; st.shared.u32 [%rd30], %r75; BB11_2: bar.sync 0; shl.b32 %r77, %r69, 1; mov.u32 %r78, %ctaid.x; mad.lo.s32 %r142, %r78, %r77, %r71; mov.f64 %fd95, 0d0000000000000000; setp.ge.u32 %p2, %r142, %r67; @%p2 bra BB11_22; setp.gt.s32 %p3, %r70, 0; mov.f64 %fd101, 0d0000000000000000; mov.f64 %fd98, %fd101; @%p3 bra BB11_10; bra.uni BB11_4; BB11_10: mov.f64 %fd75, %fd98; mov.f64 %fd6, %fd75; mov.u32 %r148, %r160; mov.u32 %r10, %r148; cvta.to.global.u64 %rd43, %rd18; cvt.u64.u32 %rd3, %r142; mul.wide.u32 %rd44, %r142, 8; add.s64 %rd45, %rd43, %rd44; ld.global.f64 %fd7, [%rd45]; setp.geu.f64 %p8, %fd6, %fd7; mov.u32 %r158, %r10; mov.f64 %fd96, %fd6; @%p8 bra BB11_15; cvta.to.global.u64 %rd46, %rd20; shl.b64 %rd47, %rd3, 2; add.s64 %rd48, %rd46, %rd47; ld.global.u32 %r12, [%rd48]; div.s32 %r13, %r12, %r68; rem.s32 %r14, %r12, %r68; mov.u32 %r143, 0; BB11_12: cvt.s64.s32 %rd4, %r143; mul.wide.s32 %rd49, %r143, 4; mov.u64 %rd50, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd51, %rd50, %rd49; ld.shared.u32 %r88, [%rd51]; setp.ne.s32 %p9, %r88, %r13; @%p9 bra BB11_14; shl.b64 %rd52, %rd4, 2; mov.u64 %rd53, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd54, %rd53, %rd52; ld.shared.u32 %r89, 
[%rd54]; setp.eq.s32 %p10, %r89, %r14; mov.u32 %r149, %r10; mov.u32 %r158, %r149; mov.f64 %fd76, %fd6; mov.f64 %fd96, %fd76; @%p10 bra BB11_15; BB11_14: add.s32 %r143, %r143, 1; setp.lt.s32 %p11, %r143, %r70; mov.u32 %r158, %r12; mov.f64 %fd96, %fd7; @%p11 bra BB11_12; BB11_15: mov.f64 %fd8, %fd96; mov.u32 %r17, %r158; add.s32 %r18, %r142, %r69; setp.ge.u32 %p12, %r18, %r67; mov.u32 %r159, %r17; mov.f64 %fd97, %fd8; @%p12 bra BB11_21; cvt.u64.u32 %rd5, %r18; mul.wide.u32 %rd56, %r18, 8; add.s64 %rd57, %rd43, %rd56; ld.global.f64 %fd9, [%rd57]; setp.geu.f64 %p13, %fd8, %fd9; mov.u32 %r146, %r17; mov.u32 %r159, %r146; mov.f64 %fd73, %fd8; mov.f64 %fd97, %fd73; @%p13 bra BB11_21; cvta.to.global.u64 %rd58, %rd20; shl.b64 %rd59, %rd5, 2; add.s64 %rd60, %rd58, %rd59; ld.global.u32 %r19, [%rd60]; div.s32 %r20, %r19, %r68; rem.s32 %r21, %r19, %r68; mov.u32 %r144, 0; BB11_18: cvt.s64.s32 %rd6, %r144; mul.wide.s32 %rd61, %r144, 4; mov.u64 %rd62, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd63, %rd62, %rd61; ld.shared.u32 %r91, [%rd63]; setp.ne.s32 %p14, %r91, %r20; @%p14 bra BB11_20; shl.b64 %rd64, %rd6, 2; mov.u64 %rd65, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd66, %rd65, %rd64; ld.shared.u32 %r92, [%rd66]; setp.eq.s32 %p15, %r92, %r21; mov.u32 %r147, %r17; mov.u32 %r159, %r147; mov.f64 %fd74, %fd8; mov.f64 %fd97, %fd74; @%p15 bra BB11_21; BB11_20: add.s32 %r144, %r144, 1; setp.lt.s32 %p16, %r144, %r70; mov.u32 %r159, %r19; mov.f64 %fd97, %fd9; @%p16 bra BB11_18; BB11_21: mov.f64 %fd98, %fd97; mov.u32 %r160, %r159; mov.u32 %r94, %nctaid.x; mad.lo.s32 %r142, %r94, %r77, %r142; setp.lt.u32 %p17, %r142, %r67; mov.u32 %r157, %r160; mov.f64 %fd95, %fd98; @%p17 bra BB11_10; bra.uni BB11_22; BB11_4: mov.f64 %fd81, %fd101; mov.f64 %fd1, %fd81; mov.u32 %r154, %r163; mov.u32 %r161, %r154; cvta.to.global.u64 %rd31, %rd18; cvt.u64.u32 %rd1, %r142; mul.wide.u32 %rd32, %r142, 8; add.s64 %rd33, %rd31, %rd32; 
ld.global.f64 %fd2, [%rd33]; setp.geu.f64 %p4, %fd1, %fd2; mov.f64 %fd99, %fd1; @%p4 bra BB11_6; cvta.to.global.u64 %rd34, %rd20; shl.b64 %rd35, %rd1, 2; add.s64 %rd36, %rd34, %rd35; ld.global.u32 %r161, [%rd36]; mov.f64 %fd99, %fd2; BB11_6: mov.f64 %fd3, %fd99; mov.u32 %r162, %r161; add.s32 %r6, %r142, %r69; setp.ge.u32 %p5, %r6, %r67; mov.f64 %fd100, %fd3; @%p5 bra BB11_9; cvt.u64.u32 %rd2, %r6; mul.wide.u32 %rd38, %r6, 8; add.s64 %rd39, %rd31, %rd38; ld.global.f64 %fd4, [%rd39]; setp.geu.f64 %p6, %fd3, %fd4; mov.f64 %fd80, %fd3; mov.f64 %fd100, %fd80; @%p6 bra BB11_9; cvta.to.global.u64 %rd40, %rd20; shl.b64 %rd41, %rd2, 2; add.s64 %rd42, %rd40, %rd41; ld.global.u32 %r162, [%rd42]; mov.f64 %fd100, %fd4; BB11_9: mov.f64 %fd101, %fd100; mov.u32 %r163, %r162; mov.u32 %r86, %nctaid.x; mad.lo.s32 %r142, %r86, %r77, %r142; setp.lt.u32 %p7, %r142, %r67; mov.u32 %r157, %r163; mov.f64 %fd95, %fd101; @%p7 bra BB11_4; BB11_22: mov.f64 %fd93, %fd95; mul.wide.u32 %rd67, %r71, 8; mov.u64 %rd68, reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata; add.s64 %rd7, %rd68, %rd67; st.shared.f64 [%rd7], %fd93; mul.wide.u32 %rd69, %r71, 4; mov.u64 %rd70, reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData; add.s64 %rd8, %rd70, %rd69; st.shared.u32 [%rd8], %r157; bar.sync 0; setp.lt.s32 %p18, %r69, 512; @%p18 bra BB11_31; setp.gt.u32 %p19, %r71, 255; mov.f64 %fd94, %fd93; @%p19 bra BB11_30; ld.shared.f64 %fd12, [%rd7+2048]; setp.geu.f64 %p20, %fd93, %fd12; mov.f64 %fd70, %fd93; mov.f64 %fd94, %fd70; @%p20 bra BB11_30; ld.shared.u32 %r27, [%rd8+1024]; div.s32 %r28, %r27, %r68; rem.s32 %r29, %r27, %r68; mov.u32 %r164, 0; setp.lt.s32 %p21, %r70, 1; @%p21 bra BB11_29; BB11_26: cvt.s64.s32 %rd9, %r164; mul.wide.s32 %rd71, %r164, 4; mov.u64 %rd72, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd73, %rd72, %rd71; ld.shared.u32 %r98, [%rd73]; setp.ne.s32 %p22, %r98, %r28; @%p22 bra BB11_28; shl.b64 %rd74, %rd9, 2; mov.u64 %rd75, 
reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd76, %rd75, %rd74; ld.shared.u32 %r99, [%rd76]; setp.eq.s32 %p23, %r99, %r29; mov.f64 %fd71, %fd93; mov.f64 %fd94, %fd71; @%p23 bra BB11_30; BB11_28: add.s32 %r164, %r164, 1; setp.lt.s32 %p24, %r164, %r70; @%p24 bra BB11_26; BB11_29: st.shared.f64 [%rd7], %fd12; st.shared.u32 [%rd8], %r27; mov.f64 %fd94, %fd12; BB11_30: mov.f64 %fd93, %fd94; bar.sync 0; BB11_31: mov.f64 %fd91, %fd93; setp.lt.s32 %p25, %r69, 256; @%p25 bra BB11_40; setp.gt.u32 %p26, %r71, 127; mov.f64 %fd92, %fd91; @%p26 bra BB11_39; ld.shared.f64 %fd15, [%rd7+1024]; setp.geu.f64 %p27, %fd91, %fd15; mov.f64 %fd65, %fd91; mov.f64 %fd92, %fd65; @%p27 bra BB11_39; ld.shared.u32 %r32, [%rd8+512]; div.s32 %r33, %r32, %r68; rem.s32 %r34, %r32, %r68; mov.u32 %r165, 0; setp.lt.s32 %p28, %r70, 1; @%p28 bra BB11_38; BB11_35: cvt.s64.s32 %rd10, %r165; mul.wide.s32 %rd77, %r165, 4; mov.u64 %rd78, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd79, %rd78, %rd77; ld.shared.u32 %r102, [%rd79]; setp.ne.s32 %p29, %r102, %r33; @%p29 bra BB11_37; shl.b64 %rd80, %rd10, 2; mov.u64 %rd81, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd82, %rd81, %rd80; ld.shared.u32 %r103, [%rd82]; setp.eq.s32 %p30, %r103, %r34; mov.f64 %fd66, %fd91; mov.f64 %fd92, %fd66; @%p30 bra BB11_39; BB11_37: add.s32 %r165, %r165, 1; setp.lt.s32 %p31, %r165, %r70; @%p31 bra BB11_35; BB11_38: st.shared.f64 [%rd7], %fd15; st.shared.u32 [%rd8], %r32; mov.f64 %fd92, %fd15; BB11_39: mov.f64 %fd91, %fd92; bar.sync 0; BB11_40: mov.f64 %fd89, %fd91; setp.lt.s32 %p32, %r69, 128; @%p32 bra BB11_49; setp.gt.u32 %p33, %r71, 63; mov.f64 %fd90, %fd89; @%p33 bra BB11_48; ld.shared.f64 %fd18, [%rd7+512]; setp.geu.f64 %p34, %fd89, %fd18; mov.f64 %fd60, %fd89; mov.f64 %fd90, %fd60; @%p34 bra BB11_48; ld.shared.u32 %r37, [%rd8+256]; div.s32 %r38, %r37, %r68; rem.s32 %r39, %r37, %r68; mov.u32 %r166, 0; setp.lt.s32 %p35, 
%r70, 1; @%p35 bra BB11_47; BB11_44: cvt.s64.s32 %rd11, %r166; mul.wide.s32 %rd83, %r166, 4; mov.u64 %rd84, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd85, %rd84, %rd83; ld.shared.u32 %r106, [%rd85]; setp.ne.s32 %p36, %r106, %r38; @%p36 bra BB11_46; shl.b64 %rd86, %rd11, 2; mov.u64 %rd87, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd88, %rd87, %rd86; ld.shared.u32 %r107, [%rd88]; setp.eq.s32 %p37, %r107, %r39; mov.f64 %fd61, %fd89; mov.f64 %fd90, %fd61; @%p37 bra BB11_48; BB11_46: add.s32 %r166, %r166, 1; setp.lt.s32 %p38, %r166, %r70; @%p38 bra BB11_44; BB11_47: st.shared.f64 [%rd7], %fd18; st.shared.u32 [%rd8], %r37; mov.f64 %fd90, %fd18; BB11_48: mov.f64 %fd89, %fd90; bar.sync 0; BB11_49: mov.f64 %fd20, %fd89; setp.gt.u32 %p39, %r71, 31; @%p39 bra BB11_93; setp.lt.s32 %p40, %r69, 64; mov.f64 %fd88, %fd20; @%p40 bra BB11_57; ld.volatile.shared.f64 %fd34, [%rd7+256]; setp.geu.f64 %p41, %fd20, %fd34; mov.f64 %fd55, %fd20; mov.f64 %fd88, %fd55; @%p41 bra BB11_57; ld.volatile.shared.u32 %r110, [%rd8+128]; div.s32 %r42, %r110, %r68; rem.s32 %r43, %r110, %r68; mov.u32 %r167, 0; setp.lt.s32 %p42, %r70, 1; @%p42 bra BB11_56; BB11_53: cvt.s64.s32 %rd12, %r167; mul.wide.s32 %rd89, %r167, 4; mov.u64 %rd90, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd91, %rd90, %rd89; ld.volatile.shared.u32 %r111, [%rd91]; setp.ne.s32 %p43, %r111, %r42; @%p43 bra BB11_55; shl.b64 %rd92, %rd12, 2; mov.u64 %rd93, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd94, %rd93, %rd92; ld.volatile.shared.u32 %r112, [%rd94]; setp.eq.s32 %p44, %r112, %r43; mov.f64 %fd56, %fd20; mov.f64 %fd88, %fd56; @%p44 bra BB11_57; BB11_55: add.s32 %r167, %r167, 1; setp.lt.s32 %p45, %r167, %r70; @%p45 bra BB11_53; BB11_56: ld.volatile.shared.f64 %fd88, [%rd7+256]; st.volatile.shared.f64 [%rd7], %fd88; ld.volatile.shared.u32 %r113, [%rd8+128]; st.volatile.shared.u32 [%rd8], 
%r113; BB11_57: mov.f64 %fd22, %fd88; setp.lt.s32 %p46, %r69, 32; mov.f64 %fd87, %fd22; @%p46 bra BB11_64; ld.volatile.shared.f64 %fd35, [%rd7+128]; setp.geu.f64 %p47, %fd22, %fd35; mov.f64 %fd52, %fd22; mov.f64 %fd87, %fd52; @%p47 bra BB11_64; ld.volatile.shared.u32 %r115, [%rd8+64]; div.s32 %r46, %r115, %r68; rem.s32 %r47, %r115, %r68; mov.u32 %r168, 0; setp.lt.s32 %p48, %r70, 1; @%p48 bra BB11_63; BB11_60: cvt.s64.s32 %rd13, %r168; mul.wide.s32 %rd95, %r168, 4; mov.u64 %rd96, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd97, %rd96, %rd95; ld.volatile.shared.u32 %r116, [%rd97]; setp.ne.s32 %p49, %r116, %r46; @%p49 bra BB11_62; shl.b64 %rd98, %rd13, 2; mov.u64 %rd99, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd100, %rd99, %rd98; ld.volatile.shared.u32 %r117, [%rd100]; setp.eq.s32 %p50, %r117, %r47; mov.f64 %fd53, %fd22; mov.f64 %fd87, %fd53; @%p50 bra BB11_64; BB11_62: add.s32 %r168, %r168, 1; setp.lt.s32 %p51, %r168, %r70; @%p51 bra BB11_60; BB11_63: ld.volatile.shared.f64 %fd87, [%rd7+128]; st.volatile.shared.f64 [%rd7], %fd87; ld.volatile.shared.u32 %r118, [%rd8+64]; st.volatile.shared.u32 [%rd8], %r118; BB11_64: mov.f64 %fd24, %fd87; setp.lt.s32 %p52, %r69, 16; mov.f64 %fd86, %fd24; @%p52 bra BB11_71; ld.volatile.shared.f64 %fd36, [%rd7+64]; setp.geu.f64 %p53, %fd24, %fd36; mov.f64 %fd49, %fd24; mov.f64 %fd86, %fd49; @%p53 bra BB11_71; ld.volatile.shared.u32 %r120, [%rd8+32]; div.s32 %r50, %r120, %r68; rem.s32 %r51, %r120, %r68; mov.u32 %r169, 0; setp.lt.s32 %p54, %r70, 1; @%p54 bra BB11_70; BB11_67: cvt.s64.s32 %rd14, %r169; mul.wide.s32 %rd101, %r169, 4; mov.u64 %rd102, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd103, %rd102, %rd101; ld.volatile.shared.u32 %r121, [%rd103]; setp.ne.s32 %p55, %r121, %r50; @%p55 bra BB11_69; shl.b64 %rd104, %rd14, 2; mov.u64 %rd105, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd106, 
%rd105, %rd104; ld.volatile.shared.u32 %r122, [%rd106]; setp.eq.s32 %p56, %r122, %r51; mov.f64 %fd50, %fd24; mov.f64 %fd86, %fd50; @%p56 bra BB11_71; BB11_69: add.s32 %r169, %r169, 1; setp.lt.s32 %p57, %r169, %r70; @%p57 bra BB11_67; BB11_70: ld.volatile.shared.f64 %fd86, [%rd7+64]; st.volatile.shared.f64 [%rd7], %fd86; ld.volatile.shared.u32 %r123, [%rd8+32]; st.volatile.shared.u32 [%rd8], %r123; BB11_71: mov.f64 %fd26, %fd86; setp.lt.s32 %p58, %r69, 8; mov.f64 %fd85, %fd26; @%p58 bra BB11_78; ld.volatile.shared.f64 %fd37, [%rd7+32]; setp.geu.f64 %p59, %fd26, %fd37; mov.f64 %fd46, %fd26; mov.f64 %fd85, %fd46; @%p59 bra BB11_78; ld.volatile.shared.u32 %r125, [%rd8+16]; div.s32 %r54, %r125, %r68; rem.s32 %r55, %r125, %r68; mov.u32 %r170, 0; setp.lt.s32 %p60, %r70, 1; @%p60 bra BB11_77; BB11_74: cvt.s64.s32 %rd15, %r170; mul.wide.s32 %rd107, %r170, 4; mov.u64 %rd108, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd109, %rd108, %rd107; ld.volatile.shared.u32 %r126, [%rd109]; setp.ne.s32 %p61, %r126, %r54; @%p61 bra BB11_76; shl.b64 %rd110, %rd15, 2; mov.u64 %rd111, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd112, %rd111, %rd110; ld.volatile.shared.u32 %r127, [%rd112]; setp.eq.s32 %p62, %r127, %r55; mov.f64 %fd47, %fd26; mov.f64 %fd85, %fd47; @%p62 bra BB11_78; BB11_76: add.s32 %r170, %r170, 1; setp.lt.s32 %p63, %r170, %r70; @%p63 bra BB11_74; BB11_77: ld.volatile.shared.f64 %fd85, [%rd7+32]; st.volatile.shared.f64 [%rd7], %fd85; ld.volatile.shared.u32 %r128, [%rd8+16]; st.volatile.shared.u32 [%rd8], %r128; BB11_78: mov.f64 %fd28, %fd85; setp.lt.s32 %p64, %r69, 4; mov.f64 %fd84, %fd28; @%p64 bra BB11_85; ld.volatile.shared.f64 %fd38, [%rd7+16]; setp.geu.f64 %p65, %fd28, %fd38; mov.f64 %fd43, %fd28; mov.f64 %fd84, %fd43; @%p65 bra BB11_85; ld.volatile.shared.u32 %r130, [%rd8+8]; div.s32 %r58, %r130, %r68; rem.s32 %r59, %r130, %r68; mov.u32 %r171, 0; setp.lt.s32 %p66, %r70, 1; @%p66 bra BB11_84; 
BB11_81: cvt.s64.s32 %rd16, %r171; mul.wide.s32 %rd113, %r171, 4; mov.u64 %rd114, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd115, %rd114, %rd113; ld.volatile.shared.u32 %r131, [%rd115]; setp.ne.s32 %p67, %r131, %r58; @%p67 bra BB11_83; shl.b64 %rd116, %rd16, 2; mov.u64 %rd117, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd118, %rd117, %rd116; ld.volatile.shared.u32 %r132, [%rd118]; setp.eq.s32 %p68, %r132, %r59; mov.f64 %fd44, %fd28; mov.f64 %fd84, %fd44; @%p68 bra BB11_85; BB11_83: add.s32 %r171, %r171, 1; setp.lt.s32 %p69, %r171, %r70; @%p69 bra BB11_81; BB11_84: ld.volatile.shared.f64 %fd84, [%rd7+16]; st.volatile.shared.f64 [%rd7], %fd84; ld.volatile.shared.u32 %r133, [%rd8+8]; st.volatile.shared.u32 [%rd8], %r133; BB11_85: setp.lt.s32 %p70, %r69, 2; @%p70 bra BB11_92; ld.volatile.shared.f64 %fd39, [%rd7+8]; setp.geu.f64 %p71, %fd84, %fd39; @%p71 bra BB11_92; ld.volatile.shared.u32 %r135, [%rd8+4]; div.s32 %r62, %r135, %r68; rem.s32 %r63, %r135, %r68; mov.u32 %r172, 0; setp.lt.s32 %p72, %r70, 1; @%p72 bra BB11_91; BB11_88: cvt.s64.s32 %rd17, %r172; mul.wide.s32 %rd119, %r172, 4; mov.u64 %rd120, reduce_max_filter_final$__cuda_local_var_44583_30_non_const_smaxesRow; add.s64 %rd121, %rd120, %rd119; ld.volatile.shared.u32 %r136, [%rd121]; setp.ne.s32 %p73, %r136, %r62; @%p73 bra BB11_90; shl.b64 %rd122, %rd17, 2; mov.u64 %rd123, reduce_max_filter_final$__cuda_local_var_44584_30_non_const_smaxesCol; add.s64 %rd124, %rd123, %rd122; ld.volatile.shared.u32 %r137, [%rd124]; setp.eq.s32 %p74, %r137, %r63; @%p74 bra BB11_92; BB11_90: add.s32 %r172, %r172, 1; setp.lt.s32 %p75, %r172, %r70; @%p75 bra BB11_88; BB11_91: ld.volatile.shared.f64 %fd40, [%rd7+8]; st.volatile.shared.f64 [%rd7], %fd40; ld.volatile.shared.u32 %r138, [%rd8+4]; st.volatile.shared.u32 [%rd8], %r138; BB11_92: bar.sync 0; BB11_93: setp.ne.s32 %p76, %r71, 0; @%p76 bra BB11_96; ld.shared.f64 %fd41, 
[reduce_max_filter_final$__cuda_local_var_44586_33_non_const_sdata];
    // Tail of reduce_max_filter_final (its definition starts earlier in the file).
    // Only thread 0 reaches this point: it stores the value read from sdata[0]
    // and the index read from idxData[0] at slot %ctaid.x (%r78) of the arrays
    // addressed by %rd19 and %rd20.
    cvta.to.global.u64 %rd125, %rd19;
    mul.wide.u32 %rd126, %r78, 8;
    add.s64 %rd127, %rd125, %rd126;
    st.global.f64 [%rd127], %fd41;
    ld.shared.u32 %r66, [reduce_max_filter_final$__cuda_local_var_44587_30_non_const_idxData];
    cvta.to.global.u64 %rd128, %rd20;
    mul.wide.u32 %rd129, %r78, 4;
    add.s64 %rd130, %rd128, %rd129;
    st.global.u32 [%rd130], %r66;
    // When the grid has exactly one block, also store the index at element
    // [%r70] (param_7) of the array addressed by %rd21.
    mov.u32 %r141, %nctaid.x;
    setp.ne.s32 %p77, %r141, 1;
    @%p77 bra BB11_96;
    cvta.to.global.u64 %rd131, %rd21;
    mul.wide.s32 %rd132, %r70, 4;
    add.s64 %rd133, %rd131, %rd132;
    st.global.u32 [%rd133], %r66;
BB11_96:
    ret;
}

// .globl reduce_max_filter_main
//
// reduce_max_filter_main: per-block arg-max over an f64 array that skips every
// position already recorded in a table of earlier winners.
//
// Parameter roles, as used by the code below:
//   param_0 (u64)  f64 input array, indexed by flat element index
//   param_1 (u64)  f64 output, one value per block, written at [%ctaid.x]
//   param_2 (u64)  u32 output, one flat index per block, written at [%ctaid.x]
//   param_3 (u32)  divisor splitting a flat index into (idx / param_3, idx % param_3)
//   param_4 (u32)  param_3 * param_4 is the number of elements scanned
//   param_5 (u32)  multiplied with %ctaid.x / %nctaid.x and compared against
//                  512, 256, ... 2 to select reduction stages; presumably equal
//                  to blockDim.x -- confirm against the launch site
//   param_6 (u32*) table of earlier winners (flat indices); when the grid has a
//                  single block, element [param_7] receives this call's winner
//   param_7 (u32)  number of valid entries in that table
//
// NOTE(review): the smaxes* tables are 40 bytes (10 u32 slots) and every thread
// with tid < param_7 writes one slot, so param_7 must not exceed 10.
// NOTE(review): sdata is 2048 bytes and idxData 1024 bytes (256 slots each).
// The param_5 >= 512 stage reads [sdata + tid*8 + 2048], which lies past the
// end of sdata for every tid, so the kernel only looks safe for param_5 <= 256.
.visible .entry reduce_max_filter_main(
    .param .u64 reduce_max_filter_main_param_0,
    .param .u64 reduce_max_filter_main_param_1,
    .param .u64 reduce_max_filter_main_param_2,
    .param .u32 reduce_max_filter_main_param_3,
    .param .u32 reduce_max_filter_main_param_4,
    .param .u32 reduce_max_filter_main_param_5,
    .param .u64 reduce_max_filter_main_param_6,
    .param .u32 reduce_max_filter_main_param_7
)
{
    .reg .pred %p<87>;
    .reg .b32 %r<147>;
    .reg .f64 %fd<88>;
    .reg .b64 %rd<112>;
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow[40];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol[40];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal[40];
    // demoted variable
    .shared .align 8 .b8 reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata[2048];
    // demoted variable
    .shared .align 4 .b8 reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData[1024];

    ld.param.u64 %rd14, [reduce_max_filter_main_param_0];
    ld.param.u64 %rd15, [reduce_max_filter_main_param_1];
    ld.param.u64 %rd16, [reduce_max_filter_main_param_2];
    ld.param.u32 %r60, [reduce_max_filter_main_param_3];
    ld.param.u32 %r61, [reduce_max_filter_main_param_4];
    ld.param.u32 %r62, [reduce_max_filter_main_param_5];
    ld.param.u64 %rd17, [reduce_max_filter_main_param_6];
    ld.param.u32 %r63, [reduce_max_filter_main_param_7];

    // ---- Stage the table of earlier winners in shared memory ----------------
    // Threads with tid < param_7 each copy one entry and pre-split it into
    // (entry / param_3, entry % param_3) using unsigned division.
    mov.u32 %r1, %tid.x;
    setp.ge.u32 %p3, %r1, %r63;
    @%p3 bra BB12_2;
    cvta.to.global.u64 %rd18, %rd17;
    mul.wide.u32 %rd19, %r1, 4;
    add.s64 %rd20, %rd18, %rd19;
    ld.global.u32 %r65, [%rd20];
    mov.u64 %rd21, reduce_max_filter_main$__cuda_local_var_44797_30_non_const_smaxesVal;
    add.s64 %rd22, %rd21, %rd19;
    st.shared.u32 [%rd22], %r65;
    div.u32 %r66, %r65, %r60;
    mov.u64 %rd23, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd24, %rd23, %rd19;
    st.shared.u32 [%rd24], %r66;
    rem.u32 %r67, %r65, %r60;
    mov.u64 %rd25, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd26, %rd25, %rd19;
    st.shared.u32 [%rd26], %r67;
BB12_2:
    // First element for this thread: %ctaid.x * param_5 + tid.
    mov.u32 %r68, %ctaid.x;
    mad.lo.s32 %r134, %r68, %r62, %r1;
    // Make the staged table visible to the whole block.
    bar.sync 0;
    // %r70 = param_4 * param_3 = number of elements to scan.
    mul.lo.s32 %r70, %r61, %r60;
    // Running maximum starts at -infinity.
    mov.f64 %fd84, 0dFFF0000000000000;
    // Fix: give the running arg-max index a defined starting value. Without
    // this, %r137 is read by the selp instructions and stored to idxData
    // below without ever having been written when no element is accepted.
    mov.u32 %r137, 0;
    setp.ge.u32 %p4, %r134, %r70;
    @%p4 bra BB12_18;
    mov.f64 %fd87, 0dFFF0000000000000;

    // ---- Per-thread scan ----------------------------------------------------
    // Each iteration tests element i (%r134) and element i + param_5 (%r78),
    // then advances i by %nctaid.x * param_5.
    // NOTE(review): blocks start param_5 apart but each also tests i + param_5,
    // so neighbouring blocks appear to examine overlapping windows. Harmless
    // for a maximum, but confirm it is intended.
BB12_4:
    mov.f64 %fd70, %fd87;
    mov.f64 %fd85, %fd70;
    cvta.to.global.u64 %rd27, %rd14;
    mul.wide.u32 %rd28, %r134, 8;
    add.s64 %rd29, %rd27, %rd28;
    ld.global.f64 %fd2, [%rd29];
    // Skip unless the candidate is strictly greater than the running maximum.
    setp.geu.f64 %p5, %fd85, %fd2;
    @%p5 bra BB12_10;
    // Split the candidate's flat index and look for it in the staged table.
    div.s32 %r6, %r134, %r60;
    rem.s32 %r7, %r134, %r60;
    mov.pred %p6, -1;
    mov.u32 %r135, 0;
    setp.lt.s32 %p7, %r63, 1;
    // %p84 = "candidate accepted"; stays true when the table is empty.
    mov.pred %p84, %p6;
    @%p7 bra BB12_9;
BB12_6:
    cvt.s64.s32 %rd1, %r135;
    mul.wide.s32 %rd30, %r135, 4;
    mov.u64 %rd31, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd32, %rd31, %rd30;
    ld.shared.u32 %r75, [%rd32];
    setp.ne.s32 %p8, %r75, %r6;
    @%p8 bra BB12_8;
    shl.b64 %rd33, %rd1, 2;
    mov.u64 %rd34, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd35, %rd34, %rd33;
    ld.shared.u32 %r76, [%rd35];
    setp.eq.s32 %p10, %r76, %r7;
    // Both parts match a table entry: reject the candidate.
    mov.pred %p9, 0;
    mov.pred %p84, %p9;
    @%p10 bra BB12_9;
BB12_8:
    add.s32 %r135, %r135, 1;
    setp.lt.s32 %p12, %r135, %r63;
    mov.pred %p83, %p6;
    mov.pred %p84, %p83;
    @%p12 bra BB12_6;
BB12_9:
    // Commit value and index only when the candidate was accepted.
    selp.f64 %fd85, %fd2, %fd85, %p84;
    selp.b32 %r137, %r134, %r137, %p84;
BB12_10:
    // Second candidate of this iteration: element i + param_5.
    mov.f64 %fd4, %fd85;
    add.s32 %r78, %r134, %r62;
    setp.ge.u32 %p13, %r78, %r70;
    mov.f64 %fd86, %fd4;
    @%p13 bra BB12_17;
    mul.wide.u32 %rd37, %r78, 8;
    add.s64 %rd38, %rd27, %rd37;
    ld.global.f64 %fd5, [%rd38];
    setp.geu.f64 %p14, %fd4, %fd5;
    mov.f64 %fd69, %fd4;
    mov.f64 %fd86, %fd69;
    @%p14 bra BB12_17;
    div.s32 %r12, %r78, %r60;
    rem.s32 %r13, %r78, %r60;
    mov.pred %p15, -1;
    mov.u32 %r136, 0;
    setp.lt.s32 %p16, %r63, 1;
    mov.pred %p86, %p15;
    @%p16 bra BB12_16;
BB12_13:
    cvt.s64.s32 %rd2, %r136;
    mul.wide.s32 %rd39, %r136, 4;
    mov.u64 %rd40, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd41, %rd40, %rd39;
    ld.shared.u32 %r82, [%rd41];
    setp.ne.s32 %p17, %r82, %r12;
    @%p17 bra BB12_15;
    shl.b64 %rd42, %rd2, 2;
    mov.u64 %rd43, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd44, %rd43, %rd42;
    ld.shared.u32 %r83, [%rd44];
    setp.eq.s32 %p19, %r83, %r13;
    mov.pred %p18, 0;
    mov.pred %p86, %p18;
    @%p19 bra BB12_16;
BB12_15:
    add.s32 %r136, %r136, 1;
    setp.lt.s32 %p21, %r136, %r63;
    mov.pred %p85, %p15;
    mov.pred %p86, %p85;
    @%p21 bra BB12_13;
BB12_16:
    selp.f64 %fd86, %fd5, %fd4, %p86;
    selp.b32 %r137, %r78, %r137, %p86;
BB12_17:
    // Advance by %nctaid.x * param_5 and loop while still in range.
    mov.f64 %fd87, %fd86;
    mov.u32 %r85, %nctaid.x;
    mad.lo.s32 %r134, %r85, %r62, %r134;
    setp.lt.u32 %p22, %r134, %r70;
    mov.f64 %fd84, %fd87;
    @%p22 bra BB12_4;

    // ---- Publish the per-thread result --------------------------------------
    // %rd3 = &sdata[tid] (8-byte slots), %rd4 = &idxData[tid] (4-byte slots).
BB12_18:
    mov.f64 %fd82, %fd84;
    mul.wide.u32 %rd45, %r1, 8;
    mov.u64 %rd46, reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata;
    add.s64 %rd3, %rd46, %rd45;
    st.shared.f64 [%rd3], %fd82;
    mul.wide.u32 %rd47, %r1, 4;
    mov.u64 %rd48, reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData;
    add.s64 %rd4, %rd48, %rd47;
    st.shared.u32 [%rd4], %r137;
    bar.sync 0;

    // ---- Block-level tree reduction -----------------------------------------
    // Every stage follows the same pattern: a thread adopts its partner's
    // (value, index) pair only when the partner's value is strictly greater
    // AND the partner's index, split by param_3, is absent from the table.

    // Stage: param_5 >= 512, threads 0..255 compare with slot tid + 256.
    setp.lt.s32 %p23, %r62, 512;
    @%p23 bra BB12_27;
    setp.gt.u32 %p24, %r1, 255;
    mov.f64 %fd83, %fd82;
    @%p24 bra BB12_26;
    ld.shared.f64 %fd9, [%rd3+2048];
    setp.geu.f64 %p25, %fd82, %fd9;
    mov.f64 %fd66, %fd82;
    mov.f64 %fd83, %fd66;
    @%p25 bra BB12_26;
    ld.shared.u32 %r20, [%rd4+1024];
    div.s32 %r21, %r20, %r60;
    rem.s32 %r22, %r20, %r60;
    mov.u32 %r138, 0;
    setp.lt.s32 %p26, %r63, 1;
    @%p26 bra BB12_25;
BB12_22:
    cvt.s64.s32 %rd5, %r138;
    mul.wide.s32 %rd49, %r138, 4;
    mov.u64 %rd50, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd51, %rd50, %rd49;
    ld.shared.u32 %r90, [%rd51];
    setp.ne.s32 %p27, %r90, %r21;
    @%p27 bra BB12_24;
    shl.b64 %rd52, %rd5, 2;
    mov.u64 %rd53, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd54, %rd53, %rd52;
    ld.shared.u32 %r91, [%rd54];
    setp.eq.s32 %p28, %r91, %r22;
    mov.f64 %fd67, %fd82;
    mov.f64 %fd83, %fd67;
    @%p28 bra BB12_26;
BB12_24:
    add.s32 %r138, %r138, 1;
    setp.lt.s32 %p29, %r138, %r63;
    @%p29 bra BB12_22;
BB12_25:
    st.shared.f64 [%rd3], %fd9;
    st.shared.u32 [%rd4], %r20;
    mov.f64 %fd83, %fd9;
BB12_26:
    mov.f64 %fd82, %fd83;
    bar.sync 0;

    // Stage: param_5 >= 256, threads 0..127 compare with slot tid + 128.
BB12_27:
    mov.f64 %fd80, %fd82;
    setp.lt.s32 %p30, %r62, 256;
    @%p30 bra BB12_36;
    setp.gt.u32 %p31, %r1, 127;
    mov.f64 %fd81, %fd80;
    @%p31 bra BB12_35;
    ld.shared.f64 %fd12, [%rd3+1024];
    setp.geu.f64 %p32, %fd80, %fd12;
    mov.f64 %fd61, %fd80;
    mov.f64 %fd81, %fd61;
    @%p32 bra BB12_35;
    ld.shared.u32 %r25, [%rd4+512];
    div.s32 %r26, %r25, %r60;
    rem.s32 %r27, %r25, %r60;
    mov.u32 %r139, 0;
    setp.lt.s32 %p33, %r63, 1;
    @%p33 bra BB12_34;
BB12_31:
    cvt.s64.s32 %rd6, %r139;
    mul.wide.s32 %rd55, %r139, 4;
    mov.u64 %rd56, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd57, %rd56, %rd55;
    ld.shared.u32 %r94, [%rd57];
    setp.ne.s32 %p34, %r94, %r26;
    @%p34 bra BB12_33;
    shl.b64 %rd58, %rd6, 2;
    mov.u64 %rd59, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd60, %rd59, %rd58;
    ld.shared.u32 %r95, [%rd60];
    setp.eq.s32 %p35, %r95, %r27;
    mov.f64 %fd62, %fd80;
    mov.f64 %fd81, %fd62;
    @%p35 bra BB12_35;
BB12_33:
    add.s32 %r139, %r139, 1;
    setp.lt.s32 %p36, %r139, %r63;
    @%p36 bra BB12_31;
BB12_34:
    st.shared.f64 [%rd3], %fd12;
    st.shared.u32 [%rd4], %r25;
    mov.f64 %fd81, %fd12;
BB12_35:
    mov.f64 %fd80, %fd81;
    bar.sync 0;

    // Stage: param_5 >= 128, threads 0..63 compare with slot tid + 64.
BB12_36:
    mov.f64 %fd78, %fd80;
    setp.lt.s32 %p37, %r62, 128;
    @%p37 bra BB12_45;
    setp.gt.u32 %p38, %r1, 63;
    mov.f64 %fd79, %fd78;
    @%p38 bra BB12_44;
    ld.shared.f64 %fd15, [%rd3+512];
    setp.geu.f64 %p39, %fd78, %fd15;
    mov.f64 %fd56, %fd78;
    mov.f64 %fd79, %fd56;
    @%p39 bra BB12_44;
    ld.shared.u32 %r30, [%rd4+256];
    div.s32 %r31, %r30, %r60;
    rem.s32 %r32, %r30, %r60;
    mov.u32 %r140, 0;
    setp.lt.s32 %p40, %r63, 1;
    @%p40 bra BB12_43;
BB12_40:
    cvt.s64.s32 %rd7, %r140;
    mul.wide.s32 %rd61, %r140, 4;
    mov.u64 %rd62, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd63, %rd62, %rd61;
    ld.shared.u32 %r98, [%rd63];
    setp.ne.s32 %p41, %r98, %r31;
    @%p41 bra BB12_42;
    shl.b64 %rd64, %rd7, 2;
    mov.u64 %rd65, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd66, %rd65, %rd64;
    ld.shared.u32 %r99, [%rd66];
    setp.eq.s32 %p42, %r99, %r32;
    mov.f64 %fd57, %fd78;
    mov.f64 %fd79, %fd57;
    @%p42 bra BB12_44;
BB12_42:
    add.s32 %r140, %r140, 1;
    setp.lt.s32 %p43, %r140, %r63;
    @%p43 bra BB12_40;
BB12_43:
    st.shared.f64 [%rd3], %fd15;
    st.shared.u32 [%rd4], %r30;
    mov.f64 %fd79, %fd15;
BB12_44:
    mov.f64 %fd78, %fd79;
    bar.sync 0;

    // ---- Last stages: threads 0..31 only, volatile shared accesses, no
    //      barrier between the steps -----------------------------------------
BB12_45:
    mov.f64 %fd17, %fd78;
    setp.gt.u32 %p44, %r1, 31;
    // Fix: threads that skip these stages now join at BB12_88 so that they
    // execute the closing barrier together with threads 0..31.
    @%p44 bra BB12_88;

    // Step: param_5 >= 64, compare with slot tid + 32.
    setp.lt.s32 %p45, %r62, 64;
    mov.f64 %fd77, %fd17;
    @%p45 bra BB12_53;
    ld.volatile.shared.f64 %fd30, [%rd3+256];
    setp.geu.f64 %p46, %fd17, %fd30;
    mov.f64 %fd51, %fd17;
    mov.f64 %fd77, %fd51;
    @%p46 bra BB12_53;
    ld.volatile.shared.u32 %r102, [%rd4+128];
    div.s32 %r35, %r102, %r60;
    rem.s32 %r36, %r102, %r60;
    mov.u32 %r141, 0;
    setp.lt.s32 %p47, %r63, 1;
    @%p47 bra BB12_52;
BB12_49:
    cvt.s64.s32 %rd8, %r141;
    mul.wide.s32 %rd67, %r141, 4;
    mov.u64 %rd68, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd69, %rd68, %rd67;
    ld.volatile.shared.u32 %r103, [%rd69];
    setp.ne.s32 %p48, %r103, %r35;
    @%p48 bra BB12_51;
    shl.b64 %rd70, %rd8, 2;
    mov.u64 %rd71, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd72, %rd71, %rd70;
    ld.volatile.shared.u32 %r104, [%rd72];
    setp.eq.s32 %p49, %r104, %r36;
    mov.f64 %fd52, %fd17;
    mov.f64 %fd77, %fd52;
    @%p49 bra BB12_53;
BB12_51:
    add.s32 %r141, %r141, 1;
    setp.lt.s32 %p50, %r141, %r63;
    @%p50 bra BB12_49;
BB12_52:
    ld.volatile.shared.f64 %fd77, [%rd3+256];
    st.volatile.shared.f64 [%rd3], %fd77;
    ld.volatile.shared.u32 %r105, [%rd4+128];
    st.volatile.shared.u32 [%rd4], %r105;

    // Step: param_5 >= 32, compare with slot tid + 16.
BB12_53:
    mov.f64 %fd19, %fd77;
    setp.lt.s32 %p51, %r62, 32;
    mov.f64 %fd76, %fd19;
    @%p51 bra BB12_60;
    ld.volatile.shared.f64 %fd31, [%rd3+128];
    setp.geu.f64 %p52, %fd19, %fd31;
    mov.f64 %fd48, %fd19;
    mov.f64 %fd76, %fd48;
    @%p52 bra BB12_60;
    ld.volatile.shared.u32 %r107, [%rd4+64];
    div.s32 %r39, %r107, %r60;
    rem.s32 %r40, %r107, %r60;
    mov.u32 %r142, 0;
    setp.lt.s32 %p53, %r63, 1;
    @%p53 bra BB12_59;
BB12_56:
    cvt.s64.s32 %rd9, %r142;
    mul.wide.s32 %rd73, %r142, 4;
    mov.u64 %rd74, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd75, %rd74, %rd73;
    ld.volatile.shared.u32 %r108, [%rd75];
    setp.ne.s32 %p54, %r108, %r39;
    @%p54 bra BB12_58;
    shl.b64 %rd76, %rd9, 2;
    mov.u64 %rd77, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd78, %rd77, %rd76;
    ld.volatile.shared.u32 %r109, [%rd78];
    setp.eq.s32 %p55, %r109, %r40;
    mov.f64 %fd49, %fd19;
    mov.f64 %fd76, %fd49;
    @%p55 bra BB12_60;
BB12_58:
    add.s32 %r142, %r142, 1;
    setp.lt.s32 %p56, %r142, %r63;
    @%p56 bra BB12_56;
BB12_59:
    ld.volatile.shared.f64 %fd76, [%rd3+128];
    st.volatile.shared.f64 [%rd3], %fd76;
    ld.volatile.shared.u32 %r110, [%rd4+64];
    st.volatile.shared.u32 [%rd4], %r110;

    // Step: param_5 >= 16, compare with slot tid + 8.
BB12_60:
    mov.f64 %fd21, %fd76;
    setp.lt.s32 %p57, %r62, 16;
    mov.f64 %fd75, %fd21;
    @%p57 bra BB12_67;
    ld.volatile.shared.f64 %fd32, [%rd3+64];
    setp.geu.f64 %p58, %fd21, %fd32;
    mov.f64 %fd45, %fd21;
    mov.f64 %fd75, %fd45;
    @%p58 bra BB12_67;
    ld.volatile.shared.u32 %r112, [%rd4+32];
    div.s32 %r43, %r112, %r60;
    rem.s32 %r44, %r112, %r60;
    mov.u32 %r143, 0;
    setp.lt.s32 %p59, %r63, 1;
    @%p59 bra BB12_66;
BB12_63:
    cvt.s64.s32 %rd10, %r143;
    mul.wide.s32 %rd79, %r143, 4;
    mov.u64 %rd80, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd81, %rd80, %rd79;
    ld.volatile.shared.u32 %r113, [%rd81];
    setp.ne.s32 %p60, %r113, %r43;
    @%p60 bra BB12_65;
    shl.b64 %rd82, %rd10, 2;
    mov.u64 %rd83, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd84, %rd83, %rd82;
    ld.volatile.shared.u32 %r114, [%rd84];
    setp.eq.s32 %p61, %r114, %r44;
    mov.f64 %fd46, %fd21;
    mov.f64 %fd75, %fd46;
    @%p61 bra BB12_67;
BB12_65:
    add.s32 %r143, %r143, 1;
    setp.lt.s32 %p62, %r143, %r63;
    @%p62 bra BB12_63;
BB12_66:
    ld.volatile.shared.f64 %fd75, [%rd3+64];
    st.volatile.shared.f64 [%rd3], %fd75;
    ld.volatile.shared.u32 %r115, [%rd4+32];
    st.volatile.shared.u32 [%rd4], %r115;

    // Step: param_5 >= 8, compare with slot tid + 4.
BB12_67:
    mov.f64 %fd23, %fd75;
    setp.lt.s32 %p63, %r62, 8;
    mov.f64 %fd74, %fd23;
    @%p63 bra BB12_74;
    ld.volatile.shared.f64 %fd33, [%rd3+32];
    setp.geu.f64 %p64, %fd23, %fd33;
    mov.f64 %fd42, %fd23;
    mov.f64 %fd74, %fd42;
    @%p64 bra BB12_74;
    ld.volatile.shared.u32 %r117, [%rd4+16];
    div.s32 %r47, %r117, %r60;
    rem.s32 %r48, %r117, %r60;
    mov.u32 %r144, 0;
    setp.lt.s32 %p65, %r63, 1;
    @%p65 bra BB12_73;
BB12_70:
    cvt.s64.s32 %rd11, %r144;
    mul.wide.s32 %rd85, %r144, 4;
    mov.u64 %rd86, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd87, %rd86, %rd85;
    ld.volatile.shared.u32 %r118, [%rd87];
    setp.ne.s32 %p66, %r118, %r47;
    @%p66 bra BB12_72;
    shl.b64 %rd88, %rd11, 2;
    mov.u64 %rd89, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd90, %rd89, %rd88;
    ld.volatile.shared.u32 %r119, [%rd90];
    setp.eq.s32 %p67, %r119, %r48;
    mov.f64 %fd43, %fd23;
    mov.f64 %fd74, %fd43;
    @%p67 bra BB12_74;
BB12_72:
    add.s32 %r144, %r144, 1;
    setp.lt.s32 %p68, %r144, %r63;
    @%p68 bra BB12_70;
BB12_73:
    ld.volatile.shared.f64 %fd74, [%rd3+32];
    st.volatile.shared.f64 [%rd3], %fd74;
    ld.volatile.shared.u32 %r120, [%rd4+16];
    st.volatile.shared.u32 [%rd4], %r120;

    // Step: param_5 >= 4, compare with slot tid + 2.
BB12_74:
    mov.f64 %fd25, %fd74;
    setp.lt.s32 %p69, %r62, 4;
    mov.f64 %fd73, %fd25;
    @%p69 bra BB12_81;
    ld.volatile.shared.f64 %fd34, [%rd3+16];
    setp.geu.f64 %p70, %fd25, %fd34;
    mov.f64 %fd39, %fd25;
    mov.f64 %fd73, %fd39;
    @%p70 bra BB12_81;
    ld.volatile.shared.u32 %r122, [%rd4+8];
    div.s32 %r51, %r122, %r60;
    rem.s32 %r52, %r122, %r60;
    mov.u32 %r145, 0;
    setp.lt.s32 %p71, %r63, 1;
    @%p71 bra BB12_80;
BB12_77:
    cvt.s64.s32 %rd12, %r145;
    mul.wide.s32 %rd91, %r145, 4;
    mov.u64 %rd92, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd93, %rd92, %rd91;
    ld.volatile.shared.u32 %r123, [%rd93];
    setp.ne.s32 %p72, %r123, %r51;
    @%p72 bra BB12_79;
    shl.b64 %rd94, %rd12, 2;
    mov.u64 %rd95, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd96, %rd95, %rd94;
    ld.volatile.shared.u32 %r124, [%rd96];
    setp.eq.s32 %p73, %r124, %r52;
    mov.f64 %fd40, %fd25;
    mov.f64 %fd73, %fd40;
    @%p73 bra BB12_81;
BB12_79:
    add.s32 %r145, %r145, 1;
    setp.lt.s32 %p74, %r145, %r63;
    @%p74 bra BB12_77;
BB12_80:
    ld.volatile.shared.f64 %fd73, [%rd3+16];
    st.volatile.shared.f64 [%rd3], %fd73;
    ld.volatile.shared.u32 %r125, [%rd4+8];
    st.volatile.shared.u32 [%rd4], %r125;

    // Step: param_5 >= 2, compare with slot tid + 1.
BB12_81:
    setp.lt.s32 %p75, %r62, 2;
    @%p75 bra BB12_88;
    ld.volatile.shared.f64 %fd35, [%rd3+8];
    setp.geu.f64 %p76, %fd73, %fd35;
    @%p76 bra BB12_88;
    ld.volatile.shared.u32 %r127, [%rd4+4];
    div.s32 %r55, %r127, %r60;
    rem.s32 %r56, %r127, %r60;
    mov.u32 %r146, 0;
    setp.lt.s32 %p77, %r63, 1;
    @%p77 bra BB12_87;
BB12_84:
    cvt.s64.s32 %rd13, %r146;
    mul.wide.s32 %rd97, %r146, 4;
    mov.u64 %rd98, reduce_max_filter_main$__cuda_local_var_44795_30_non_const_smaxesRow;
    add.s64 %rd99, %rd98, %rd97;
    ld.volatile.shared.u32 %r128, [%rd99];
    setp.ne.s32 %p78, %r128, %r55;
    @%p78 bra BB12_86;
    shl.b64 %rd100, %rd13, 2;
    mov.u64 %rd101, reduce_max_filter_main$__cuda_local_var_44796_30_non_const_smaxesCol;
    add.s64 %rd102, %rd101, %rd100;
    ld.volatile.shared.u32 %r129, [%rd102];
    setp.eq.s32 %p79, %r129, %r56;
    @%p79 bra BB12_88;
BB12_86:
    add.s32 %r146, %r146, 1;
    setp.lt.s32 %p80, %r146, %r63;
    @%p80 bra BB12_84;
BB12_87:
    ld.volatile.shared.f64 %fd36, [%rd3+8];
    st.volatile.shared.f64 [%rd3], %fd36;
    ld.volatile.shared.u32 %r130, [%rd4+4];
    st.volatile.shared.u32 [%rd4], %r130;

    // ---- Join point for every thread in the block ---------------------------
BB12_88:
    // Fix: the barrier is now executed by all threads. It used to be reached
    // only by threads 0..31, i.e. it sat in divergent control flow.
    bar.sync 0;
    // Thread 0 publishes the block result at slot %ctaid.x (%r68).
    setp.ne.s32 %p81, %r1, 0;
    @%p81 bra BB12_92;
    ld.shared.f64 %fd37, [reduce_max_filter_main$__cuda_local_var_44798_33_non_const_sdata];
    cvta.to.global.u64 %rd103, %rd15;
    mul.wide.u32 %rd104, %r68, 8;
    add.s64 %rd105, %rd103, %rd104;
    st.global.f64 [%rd105], %fd37;
    ld.shared.u32 %r59, [reduce_max_filter_main$__cuda_local_var_44799_30_non_const_idxData];
    cvta.to.global.u64 %rd106, %rd16;
    mul.wide.u32 %rd107, %r68, 4;
    add.s64 %rd108, %rd106, %rd107;
    st.global.u32 [%rd108], %r59;
    // Single-block grid: also append the winner at element [param_7] of the
    // table passed in param_6.
    mov.u32 %r133, %nctaid.x;
    setp.ne.s32 %p82, %r133, 1;
    @%p82 bra BB12_92;
    cvta.to.global.u64 %rd109, %rd17;
    mul.wide.s32 %rd110, %r63, 4;
    add.s64 %rd111, %rd109, %rd110;
    st.global.u32 [%rd111], %r59;
BB12_92:
    ret;
}

// .globl elt_prod_conjf
//
// elt_prod_conjf: one thread per element i = %ctaid.x * 256 + %tid.x, guarded
// by i < param_3. Reads one pair of f32 values from each of param_1 and
// param_2 at element i and writes one pair to param_0 at element i.
// The three shared buffers are 2048 bytes each and are indexed with tid * 8,
// so the block must not have more than 256 threads.
.visible .entry elt_prod_conjf(
    .param .u64 elt_prod_conjf_param_0,
    .param .u64 elt_prod_conjf_param_1,
    .param .u64 elt_prod_conjf_param_2,
    .param .u32 elt_prod_conjf_param_3
)
{
    .reg .pred %p<10>;
    .reg .f32 %f<41>;
    .reg .b32 %r<7>;
    .reg .b64 %rd<22>;
    // demoted variable
    .shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc[2048];
    // demoted variable
    .shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1[2048];
    // demoted variable
    .shared .align 8 .b8 elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2[2048];

    ld.param.u64 %rd6, [elt_prod_conjf_param_0];
    ld.param.u64 %rd7, [elt_prod_conjf_param_1];
    ld.param.u64 %rd8, [elt_prod_conjf_param_2];
    ld.param.u32 %r3, [elt_prod_conjf_param_3];
    // %r2 = %ctaid.x * 256 + %tid.x; threads at or beyond param_3 exit.
    mov.u32 %r4, %ctaid.x;
    shl.b32 %r5, %r4, 8;
    mov.u32 %r1, %tid.x;
    add.s32 %r2, %r5, %r1;
    setp.ge.s32 %p1, %r2, %r3;
    @%p1 bra BB13_5;
    cvta.to.global.u64 %rd9, %rd7;
cvt.u64.u32 %rd1, %r1;
    // (continuation of elt_prod_conjf, after its "index < param_3" guard)
    // Stage this thread's element of param_1 in sc1[tid] and of param_2 in
    // sc2[tid]; %r2 is the global element index, %r1 the thread index.
    mul.wide.u32 %rd10, %r1, 8;
    mov.u64 %rd11, elt_prod_conjf$__cuda_local_var_45053_39_non_const_sc1;
    add.s64 %rd2, %rd11, %rd10;
    cvt.s64.s32 %rd3, %r2;
    mul.wide.s32 %rd12, %r2, 8;
    add.s64 %rd13, %rd9, %rd12;
    ld.global.v2.f32 {%f6, %f7}, [%rd13];
    st.shared.v2.f32 [%rd2], {%f6, %f7};
    mov.u64 %rd14, elt_prod_conjf$__cuda_local_var_45054_39_non_const_sc2;
    add.s64 %rd4, %rd14, %rd10;
    cvta.to.global.u64 %rd15, %rd8;
    add.s64 %rd16, %rd15, %rd12;
    ld.global.v2.f32 {%f10, %f11}, [%rd16];
    st.shared.v2.f32 [%rd4], {%f10, %f11};
    // Fix: the "bar.sync 0" that used to follow was inside the bounds guard,
    // so threads past the end of the data never reached it (a barrier in
    // divergent control flow). It is not needed: each thread reads back only
    // the sc1[tid] / sc2[tid] slots it has just written itself.
    ld.shared.v2.f32 {%f14, %f15}, [%rd4];
    ld.shared.v2.f32 {%f18, %f19}, [%rd2];
    // With a = (%f18, %f19) from param_1 and b = (%f14, %f15) from param_2:
    //   %f39 = a.x*b.x + a.y*b.y      %f2 = a.y*b.x - a.x*b.y
    // i.e. the real and imaginary parts of a * conj(b).
    mul.f32 %f22, %f19, %f15;
    fma.rn.f32 %f39, %f18, %f14, %f22;
    mul.f32 %f23, %f18, %f15;
    mul.f32 %f24, %f19, %f14;
    sub.f32 %f2, %f24, %f23;
    shl.b64 %rd17, %rd1, 3;
    mov.u64 %rd18, elt_prod_conjf$__cuda_local_var_45052_39_non_const_sfc;
    add.s64 %rd5, %rd18, %rd17;
    st.shared.v2.f32 [%rd5], {%f39, %f2};
    // Magnitude as hi * sqrt(1 + (lo/hi)^2) with hi = max(|re|,|im|) and
    // lo = min(|re|,|im|); hi + lo is used instead when hi is zero or either
    // operand compares greater than 0f7F7FFFFF.
    abs.f32 %f25, %f39;
    abs.f32 %f26, %f2;
    setp.gt.f32 %p2, %f25, %f26;
    selp.f32 %f27, %f25, %f26, %p2;
    selp.f32 %f28, %f26, %f25, %p2;
    div.rn.f32 %f29, %f28, %f27;
    fma.rn.f32 %f30, %f29, %f29, 0f3F800000;
    sqrt.rn.f32 %f31, %f30;
    mul.f32 %f32, %f27, %f31;
    setp.eq.f32 %p3, %f27, 0f00000000;
    setp.gt.f32 %p4, %f27, 0f7F7FFFFF;
    or.pred %p5, %p3, %p4;
    setp.gt.f32 %p6, %f28, 0f7F7FFFFF;
    or.pred %p7, %p5, %p6;
    add.f32 %f33, %f27, %f28;
    selp.f32 %f40, %f33, %f32, %p7;
    // Keep the magnitude unless it is zero or fails the ordered "<= +inf"
    // comparison (which only a NaN does).
    setp.eq.f32 %p8, %f40, 0f00000000;
    @%p8 bra BB13_3;
    abs.f32 %f34, %f40;
    setp.le.f32 %p9, %f34, 0f7F800000;
    @%p9 bra BB13_4;
BB13_3:
    // Replace both the real part and the magnitude with 0f34000000
    // (bit pattern 872415232, i.e. 2^-23).
    mov.u32 %r6, 872415232;
    st.shared.u32 [%rd5], %r6;
    mov.f32 %f40, 0f34000000;
    mov.f32 %f39, %f40;
BB13_4:
    // param_0[i] = (re / magnitude, im / magnitude)
    cvta.to.global.u64 %rd19, %rd6;
    shl.b64 %rd20, %rd3, 3;
    add.s64 %rd21, %rd19, %rd20;
    div.rn.f32 %f37, %f2, %f40;
    div.rn.f32 %f38, %f39, %f40;
    st.global.v2.f32 [%rd21], {%f38, %f37};
BB13_5:
    ret;
}

// .globl elt_prod_conj_v2f
//
// elt_prod_conj_v2f: one thread per element i = %ctaid.x * 256 + %tid.x,
// guarded by i < param_3. With a = param_1[i] and b = param_2[i] (pairs of
// f32), writes (re, im) / sqrt(re^2 + im^2) to param_0[i], where
//   re = a.x*b.x + a.y*b.y      im = a.y*b.x - a.x*b.y
// The shared buffer sfc is 2048 bytes and indexed with tid * 8, so the block
// must not have more than 256 threads.
.visible .entry elt_prod_conj_v2f(
    .param .u64 elt_prod_conj_v2f_param_0,
    .param .u64 elt_prod_conj_v2f_param_1,
    .param .u64 elt_prod_conj_v2f_param_2,
    .param .u32 elt_prod_conj_v2f_param_3
)
{
    .reg .pred %p<5>;
    .reg .f32 %f<30>;
    .reg .b32 %r<7>;
    .reg .b64 %rd<16>;
    // demoted variable
    .shared .align 8 .b8 elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc[2048];

    ld.param.u64 %rd3, [elt_prod_conj_v2f_param_0];
    ld.param.u64 %rd4, [elt_prod_conj_v2f_param_1];
    ld.param.u64 %rd5, [elt_prod_conj_v2f_param_2];
    ld.param.u32 %r3, [elt_prod_conj_v2f_param_3];
    mov.u32 %r4, %ctaid.x;
    shl.b32 %r5, %r4, 8;
    mov.u32 %r1, %tid.x;
    add.s32 %r2, %r5, %r1;
    setp.ge.s32 %p1, %r2, %r3;
    @%p1 bra BB14_4;
    cvta.to.global.u64 %rd6, %rd4;
    cvt.s64.s32 %rd1, %r2;
    mul.wide.s32 %rd7, %r2, 8;
    add.s64 %rd8, %rd6, %rd7;
    cvta.to.global.u64 %rd9, %rd5;
    add.s64 %rd10, %rd9, %rd7;
    // b = (%f6, %f7) from param_2, a = (%f10, %f11) from param_1.
    ld.global.v2.f32 {%f6, %f7}, [%rd10];
    ld.global.v2.f32 {%f10, %f11}, [%rd8];
    mul.f32 %f14, %f11, %f7;
    mul.f32 %f15, %f10, %f7;
    mul.f32 %f16, %f11, %f6;
    mul.wide.u32 %rd11, %r1, 8;
    mov.u64 %rd12, elt_prod_conj_v2f$__cuda_local_var_45085_39_non_const_sfc;
    add.s64 %rd2, %rd12, %rd11;
    sub.f32 %f17, %f16, %f15;
    fma.rn.f32 %f18, %f10, %f6, %f14;
    st.shared.v2.f32 [%rd2], {%f18, %f17};
    // Fix: the "bar.sync 0" that used to follow was inside the bounds guard
    // (a barrier in divergent control flow). It is not needed: the thread
    // reads back only its own sfc[tid] slot.
    ld.shared.v2.f32 {%f19, %f20}, [%rd2];
    mov.f32 %f28, %f19;
    mul.f32 %f21, %f20, %f20;
    fma.rn.f32 %f22, %f19, %f19, %f21;
    sqrt.rn.f32 %f29, %f22;
    // Fallback when the magnitude is NaN (unordered "> +inf") or zero.
    abs.f32 %f23, %f29;
    setp.gtu.f32 %p2, %f23, 0f7F800000;
    setp.eq.f32 %p3, %f29, 0f00000000;
    or.pred %p4, %p2, %p3;
    @!%p4 bra BB14_3;
    bra.uni BB14_2;
BB14_2:
    // Replace both the real part and the magnitude with 0f34000000
    // (bit pattern 872415232, i.e. 2^-23).
    mov.u32 %r6, 872415232;
    st.shared.u32 [%rd2], %r6;
    mov.f32 %f29, 0f34000000;
    mov.f32 %f28, %f29;
BB14_3:
    cvta.to.global.u64 %rd13, %rd3;
    shl.b64 %rd14, %rd1, 3;
    add.s64 %rd15, %rd13, %rd14;
    div.rn.f32 %f26, %f20, %f29;
    div.rn.f32 %f27, %f28, %f29;
    st.global.v2.f32 [%rd15], {%f27, %f26};
BB14_4:
    ret;
}

// .globl elt_prod_conj_v3f
//
// elt_prod_conj_v3f: register-only variant (no shared memory, no barrier).
// One thread per element i = %ctaid.x * 256 + %tid.x, guarded by i < param_3.
// With a = param_1[i] and b = param_2[i] (pairs of f32):
//   re = a.x*b.x + a.y*b.y      im = a.y*b.x - a.x*b.y
// and param_0[i] = (re / m, im / m), where m = sqrt(re^2 + im^2). If that m is
// zero or NaN it is recomputed with the scaled formula in BB15_2, and a final
// zero is replaced by 0f34000000 (2^-23).
.visible .entry elt_prod_conj_v3f(
    .param .u64 elt_prod_conj_v3f_param_0,
    .param .u64 elt_prod_conj_v3f_param_1,
    .param .u64 elt_prod_conj_v3f_param_2,
    .param .u32 elt_prod_conj_v3f_param_3
)
{
    .reg .pred %p<12>;
    .reg .f32 %f<33>;
    .reg .b32 %r<6>;
    .reg .b64 %rd<13>;

    ld.param.u64 %rd2, [elt_prod_conj_v3f_param_0];
    ld.param.u64 %rd3, [elt_prod_conj_v3f_param_1];
    ld.param.u64 %rd4, [elt_prod_conj_v3f_param_2];
    ld.param.u32 %r2, [elt_prod_conj_v3f_param_3];
    mov.u32 %r3, %ctaid.x;
    shl.b32 %r4, %r3, 8;
    mov.u32 %r5, %tid.x;
    add.s32 %r1, %r4, %r5;
    setp.ge.s32 %p1, %r1, %r2;
    @%p1 bra BB15_4;
    cvta.to.global.u64 %rd5, %rd3;
    cvt.s64.s32 %rd1, %r1;
    mul.wide.s32 %rd6, %r1, 8;
    add.s64 %rd7, %rd5, %rd6;
    cvta.to.global.u64 %rd8, %rd4;
    add.s64 %rd9, %rd8, %rd6;
    // b = (%f6, %f7) from param_2, a = (%f10, %f11) from param_1.
    ld.global.v2.f32 {%f6, %f7}, [%rd9];
    ld.global.v2.f32 {%f10, %f11}, [%rd7];
    mul.f32 %f14, %f11, %f7;
    fma.rn.f32 %f1, %f10, %f6, %f14;
    mul.f32 %f15, %f10, %f7;
    mul.f32 %f16, %f11, %f6;
    sub.f32 %f2, %f16, %f15;
    mul.f32 %f17, %f2, %f2;
    fma.rn.f32 %f18, %f1, %f1, %f17;
    sqrt.rn.f32 %f32, %f18;
    // Take the fallback path when the magnitude is NaN or zero.
    abs.f32 %f19, %f32;
    setp.gtu.f32 %p2, %f19, 0f7F800000;
    setp.eq.f32 %p3, %f32, 0f00000000;
    or.pred %p4, %p2, %p3;
    @!%p4 bra BB15_3;
    bra.uni BB15_2;
BB15_2:
    // hi * sqrt(1 + (lo/hi)^2) with hi = max(|re|,|im|), lo = min(|re|,|im|);
    // hi + lo is used instead when hi is zero or either operand compares
    // greater than 0f7F7FFFFF.
    abs.f32 %f20, %f1;
    abs.f32 %f21, %f2;
    setp.gt.f32 %p5, %f20, %f21;
    selp.f32 %f22, %f20, %f21, %p5;
    selp.f32 %f23, %f21, %f20, %p5;
    div.rn.f32 %f24, %f23, %f22;
    fma.rn.f32 %f25, %f24, %f24, 0f3F800000;
    sqrt.rn.f32 %f26, %f25;
    mul.f32 %f27, %f22, %f26;
    setp.eq.f32 %p6, %f22, 0f00000000;
    setp.gt.f32 %p7, %f22, 0f7F7FFFFF;
    or.pred %p8, %p6, %p7;
    setp.gt.f32 %p9, %f23, 0f7F7FFFFF;
    or.pred %p10, %p8, %p9;
    add.f32 %f28, %f22, %f23;
    selp.f32 %f32, %f28, %f27, %p10;
BB15_3:
    cvta.to.global.u64 %rd10, %rd2;
    // Avoid dividing by an exact zero: substitute 0f34000000.
    setp.eq.f32 %p11, %f32, 0f00000000;
    selp.f32 %f29, 0f34000000, %f32, %p11;
    shl.b64 %rd11, %rd1, 3;
    add.s64 %rd12, %rd10, %rd11;
    div.rn.f32 %f30, %f2, %f29;
    div.rn.f32 %f31, %f1, %f29;
    st.global.v2.f32 [%rd12], {%f31, %f30};
BB15_4:
    ret;
}

// .globl reduce_max_finalf
.visible .entry reduce_max_finalf(
    .param .u64 reduce_max_finalf_param_0,
    .param .u64 reduce_max_finalf_param_1,
    .param .u64 reduce_max_finalf_param_2,
    .param .u32 reduce_max_finalf_param_3,
    .param .u32 reduce_max_finalf_param_4
)
{
    .reg .pred %p<29>;
    .reg .f32 %f<78>;
// Body of reduce_max_finalf (its .entry header and first .reg declarations end the preceding line).
// Register roles as loaded below: %rd5/%rd6/%rd7 = param_0/param_1/param_2 (pointers), %r11 = param_3,
// %r12 = param_4, %r14 = %tid.x, %r16 = %ctaid.x. Shared arrays sdata and idxData are 1024 bytes each.
// Accumulate loop (BB16_1..BB16_6): i = %ctaid.x * (2*param_4) + %tid.x; the running max starts at 0.0.
//   While i < param_3 (unsigned): if max < param_0[i] then max = param_0[i] and %r37 = param_2[i];
//   the same test is repeated for element i + param_4 when that index is < param_3;
//   then i += %nctaid.x * (2*param_4).
// BB16_7 stores max to sdata[tid] and %r37 to idxData[tid], followed by a bar.sync.
// NOTE(review): %r37 is written only when some loaded value exceeds the running max, so the value
//   stored to idxData may come from a register that was never written - confirm against the .cu source.
.reg .b32 %r<38>; .reg .b64 %rd<28>; // demoted variable .shared .align 4 .b8 reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata[1024]; // demoted variable .shared .align 4 .b8 reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData[1024]; ld.param.u64 %rd5, [reduce_max_finalf_param_0]; ld.param.u64 %rd6, [reduce_max_finalf_param_1]; ld.param.u64 %rd7, [reduce_max_finalf_param_2]; ld.param.u32 %r11, [reduce_max_finalf_param_3]; ld.param.u32 %r12, [reduce_max_finalf_param_4]; mov.u32 %r14, %tid.x; shl.b32 %r15, %r12, 1; mov.u32 %r16, %ctaid.x; mad.lo.s32 %r36, %r16, %r15, %r14; mov.f32 %f74, 0f00000000; mov.f32 %f77, %f74; setp.ge.u32 %p1, %r36, %r11; @%p1 bra BB16_7; BB16_1: mov.f32 %f60, %f77; mov.f32 %f1, %f60; cvta.to.global.u64 %rd8, %rd5; cvt.u64.u32 %rd1, %r36; mul.wide.u32 %rd9, %r36, 4; add.s64 %rd10, %rd8, %rd9; ld.global.f32 %f2, [%rd10]; setp.geu.f32 %p2, %f1, %f2; mov.f32 %f75, %f1; @%p2 bra BB16_3; cvta.to.global.u64 %rd11, %rd7; shl.b64 %rd12, %rd1, 2; add.s64 %rd13, %rd11, %rd12; ld.global.u32 %r37, [%rd13]; mov.f32 %f75, %f2; BB16_3: mov.f32 %f3, %f75; add.s32 %r6, %r36, %r12; setp.ge.u32 %p3, %r6, %r11; mov.f32 %f76, %f3; @%p3 bra BB16_6; cvt.u64.u32 %rd2, %r6; mul.wide.u32 %rd15, %r6, 4; add.s64 %rd16, %rd8, %rd15; ld.global.f32 %f4, [%rd16]; setp.geu.f32 %p4, %f3, %f4; mov.f32 %f59, %f3; mov.f32 %f76, %f59; @%p4 bra BB16_6; cvta.to.global.u64 %rd17, %rd7; shl.b64 %rd18, %rd2, 2; add.s64 %rd19, %rd17, %rd18; ld.global.u32 %r37, [%rd19]; mov.f32 %f76, %f4; BB16_6: mov.f32 %f77, %f76; mov.u32 %r18, %nctaid.x; mad.lo.s32 %r36, %r18, %r15, %r36; setp.lt.u32 %p5, %r36, %r11; mov.f32 %f74, %f77; @%p5 bra BB16_1; BB16_7: mov.f32 %f72, %f74; mul.wide.u32 %rd20, %r14, 4; mov.u64 %rd21, reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata; add.s64 %rd3, %rd21, %rd20; st.shared.f32 [%rd3], %f72; mov.u64 %rd22, reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData; add.s64 %rd4, %rd22, %rd20; st.shared.u32 [%rd4], %r37; bar.sync 
// Block-level reduction over shared memory. A stage runs only when param_4 >= 512 / 256 / 128 (signed
// compare) and only for tid <= 255 / 127 / 63: if the thread's max < sdata at byte offset +1024 / +512 / +256,
// that value and the matching idxData entry are copied into the thread's own slot; each stage ends in bar.sync.
// NOTE(review): sdata and idxData are declared as 1024 bytes, so the +1024 accesses of the first stage lie
//   outside the declared arrays - presumably param_4 is never >= 512; confirm against the launch code.
// From BB16_22 on only threads with tid <= 31 continue, using ld/st.volatile with no barrier between steps:
// offset +128 (param_4 >= 64) and +64 (param_4 >= 32) are on this line; the +32 step starts at its end.
0; setp.lt.s32 %p6, %r12, 512; @%p6 bra BB16_12; setp.gt.u32 %p7, %r14, 255; mov.f32 %f73, %f72; @%p7 bra BB16_11; ld.shared.f32 %f7, [%rd3+1024]; setp.geu.f32 %p8, %f72, %f7; mov.f32 %f57, %f72; mov.f32 %f73, %f57; @%p8 bra BB16_11; st.shared.f32 [%rd3], %f7; ld.shared.u32 %r21, [%rd4+1024]; st.shared.u32 [%rd4], %r21; mov.f32 %f73, %f7; BB16_11: mov.f32 %f72, %f73; bar.sync 0; BB16_12: mov.f32 %f70, %f72; setp.lt.s32 %p9, %r12, 256; @%p9 bra BB16_17; setp.gt.u32 %p10, %r14, 127; mov.f32 %f71, %f70; @%p10 bra BB16_16; ld.shared.f32 %f10, [%rd3+512]; setp.geu.f32 %p11, %f70, %f10; mov.f32 %f53, %f70; mov.f32 %f71, %f53; @%p11 bra BB16_16; st.shared.f32 [%rd3], %f10; ld.shared.u32 %r23, [%rd4+512]; st.shared.u32 [%rd4], %r23; mov.f32 %f71, %f10; BB16_16: mov.f32 %f70, %f71; bar.sync 0; BB16_17: mov.f32 %f68, %f70; setp.lt.s32 %p12, %r12, 128; @%p12 bra BB16_22; setp.gt.u32 %p13, %r14, 63; mov.f32 %f69, %f68; @%p13 bra BB16_21; ld.shared.f32 %f13, [%rd3+256]; setp.geu.f32 %p14, %f68, %f13; mov.f32 %f49, %f68; mov.f32 %f69, %f49; @%p14 bra BB16_21; st.shared.f32 [%rd3], %f13; ld.shared.u32 %r25, [%rd4+256]; st.shared.u32 [%rd4], %r25; mov.f32 %f69, %f13; BB16_21: mov.f32 %f68, %f69; bar.sync 0; BB16_22: mov.f32 %f67, %f68; setp.gt.u32 %p15, %r14, 31; @%p15 bra BB16_42; setp.lt.s32 %p16, %r12, 64; @%p16 bra BB16_26; ld.volatile.shared.f32 %f28, [%rd3+128]; setp.geu.f32 %p17, %f67, %f28; @%p17 bra BB16_26; ld.volatile.shared.f32 %f67, [%rd3+128]; st.volatile.shared.f32 [%rd3], %f67; ld.volatile.shared.u32 %r27, [%rd4+128]; st.volatile.shared.u32 [%rd4], %r27; BB16_26: mov.f32 %f66, %f67; setp.lt.s32 %p18, %r12, 32; @%p18 bra BB16_29; ld.volatile.shared.f32 %f29, [%rd3+64]; setp.geu.f32 %p19, %f66, %f29; @%p19 bra BB16_29; ld.volatile.shared.f32 %f66, [%rd3+64]; st.volatile.shared.f32 [%rd3], %f66; ld.volatile.shared.u32 %r28, [%rd4+64]; st.volatile.shared.u32 [%rd4], %r28; BB16_29: mov.f32 %f65, %f66; setp.lt.s32 %p20, %r12, 16; @%p20 bra BB16_32; ld.volatile.shared.f32 
// Remaining volatile steps: byte offsets +32, +16, +8, +4, gated on param_4 >= 16, 8, 4, 2.
// NOTE(review): the bar.sync at BB16_41 is reached only by threads with tid <= 31; the others branched
//   straight to BB16_42 - confirm this barrier placement is intended.
// BB16_42: thread 0 writes sdata[0] to param_1[%ctaid.x] and idxData[0] to param_2[%ctaid.x].
// The rest of the line opens reduce_max_mainf (header, .reg declarations, start of its first .shared declaration).
%f30, [%rd3+32]; setp.geu.f32 %p21, %f65, %f30; @%p21 bra BB16_32; ld.volatile.shared.f32 %f65, [%rd3+32]; st.volatile.shared.f32 [%rd3], %f65; ld.volatile.shared.u32 %r29, [%rd4+32]; st.volatile.shared.u32 [%rd4], %r29; BB16_32: mov.f32 %f64, %f65; setp.lt.s32 %p22, %r12, 8; @%p22 bra BB16_35; ld.volatile.shared.f32 %f31, [%rd3+16]; setp.geu.f32 %p23, %f64, %f31; @%p23 bra BB16_35; ld.volatile.shared.f32 %f64, [%rd3+16]; st.volatile.shared.f32 [%rd3], %f64; ld.volatile.shared.u32 %r30, [%rd4+16]; st.volatile.shared.u32 [%rd4], %r30; BB16_35: mov.f32 %f63, %f64; setp.lt.s32 %p24, %r12, 4; @%p24 bra BB16_38; ld.volatile.shared.f32 %f32, [%rd3+8]; setp.geu.f32 %p25, %f63, %f32; @%p25 bra BB16_38; ld.volatile.shared.f32 %f63, [%rd3+8]; st.volatile.shared.f32 [%rd3], %f63; ld.volatile.shared.u32 %r31, [%rd4+8]; st.volatile.shared.u32 [%rd4], %r31; BB16_38: setp.lt.s32 %p26, %r12, 2; @%p26 bra BB16_41; ld.volatile.shared.f32 %f33, [%rd3+4]; setp.geu.f32 %p27, %f63, %f33; @%p27 bra BB16_41; ld.volatile.shared.f32 %f34, [%rd3+4]; st.volatile.shared.f32 [%rd3], %f34; ld.volatile.shared.u32 %r32, [%rd4+4]; st.volatile.shared.u32 [%rd4], %r32; BB16_41: bar.sync 0; BB16_42: setp.ne.s32 %p28, %r14, 0; @%p28 bra BB16_44; ld.shared.f32 %f35, [reduce_max_finalf$__cuda_local_var_45150_32_non_const_sdata]; cvta.to.global.u64 %rd23, %rd6; mul.wide.u32 %rd24, %r16, 4; add.s64 %rd25, %rd23, %rd24; st.global.f32 [%rd25], %f35; ld.shared.u32 %r35, [reduce_max_finalf$__cuda_local_var_45151_30_non_const_idxData]; cvta.to.global.u64 %rd26, %rd7; add.s64 %rd27, %rd26, %rd24; st.global.u32 [%rd27], %r35; BB16_44: ret; } // .globl reduce_max_mainf .visible .entry reduce_max_mainf( .param .u64 reduce_max_mainf_param_0, .param .u64 reduce_max_mainf_param_1, .param .u64 reduce_max_mainf_param_2, .param .u32 reduce_max_mainf_param_3, .param .u32 reduce_max_mainf_param_4 ) { .reg .pred %p<29>; .reg .f32 %f<75>; .reg .b32 %r<39>; .reg .b64 %rd<20>; // demoted variable .shared .align 4 .b8 
// Body of reduce_max_mainf (its .entry header and .reg declarations end the preceding line).
// Register roles as loaded below: %rd3/%rd4/%rd5 = param_0/param_1/param_2 (pointers), %r10 = param_3,
// %r11 = param_4, %r13 = %tid.x, %r14 = %ctaid.x. Shared arrays sdata and idxData are 1024 bytes each.
// Accumulate loop (BB17_1..BB17_3): i = %ctaid.x * param_4 + %tid.x; the running max starts at 0.0.
//   While i < param_3 (unsigned): selp picks the larger of max and param_0[i] and records index i when the
//   loaded value wins; the same is done for element i + param_4 when that index is < param_3;
//   then i += %nctaid.x * param_4. The recorded index is the element position, not a value read from memory.
// NOTE(review): %r37 is read (mov.u32 %r2, %r37) before any write on the first iteration, and %r36 is
//   stored to idxData at BB17_4 without having been written when the loop is skipped - confirm in the .cu source.
// BB17_4 stores max / index to sdata[tid] / idxData[tid], then bar.sync. The first reduction stage follows:
//   when param_4 >= 512 and tid <= 255, a larger value at byte offset +1024 (and its index) replaces the own slot.
// NOTE(review): both shared arrays are declared as 1024 bytes, so the +1024 accesses lie outside them -
//   presumably param_4 is never >= 512; confirm against the launch code.
reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata[1024]; // demoted variable .shared .align 4 .b8 reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData[1024]; ld.param.u64 %rd3, [reduce_max_mainf_param_0]; ld.param.u64 %rd4, [reduce_max_mainf_param_1]; ld.param.u64 %rd5, [reduce_max_mainf_param_2]; ld.param.u32 %r10, [reduce_max_mainf_param_3]; ld.param.u32 %r11, [reduce_max_mainf_param_4]; mov.u32 %r13, %tid.x; mov.u32 %r14, %ctaid.x; mad.lo.s32 %r33, %r14, %r11, %r13; mov.f32 %f72, 0f00000000; mov.f32 %f73, %f72; setp.ge.u32 %p1, %r33, %r10; @%p1 bra BB17_4; BB17_1: mov.f32 %f1, %f73; mov.u32 %r2, %r37; cvta.to.global.u64 %rd6, %rd3; mul.wide.u32 %rd7, %r33, 4; add.s64 %rd8, %rd6, %rd7; ld.global.f32 %f27, [%rd8]; setp.lt.f32 %p2, %f1, %f27; selp.f32 %f74, %f27, %f1, %p2; selp.b32 %r38, %r33, %r2, %p2; add.s32 %r5, %r33, %r11; setp.ge.u32 %p3, %r5, %r10; @%p3 bra BB17_3; mul.wide.u32 %rd10, %r5, 4; add.s64 %rd11, %rd6, %rd10; ld.global.f32 %f28, [%rd11]; setp.lt.f32 %p4, %f74, %f28; selp.f32 %f74, %f28, %f74, %p4; selp.b32 %r38, %r5, %r38, %p4; BB17_3: mov.f32 %f73, %f74; mov.u32 %r37, %r38; mov.u32 %r15, %nctaid.x; mad.lo.s32 %r33, %r15, %r11, %r33; setp.lt.u32 %p5, %r33, %r10; mov.u32 %r36, %r37; mov.f32 %f72, %f73; @%p5 bra BB17_1; BB17_4: mov.f32 %f70, %f72; mul.wide.u32 %rd12, %r13, 4; mov.u64 %rd13, reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata; add.s64 %rd1, %rd13, %rd12; st.shared.f32 [%rd1], %f70; mov.u64 %rd14, reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData; add.s64 %rd2, %rd14, %rd12; st.shared.u32 [%rd2], %r36; bar.sync 0; setp.lt.s32 %p6, %r11, 512; @%p6 bra BB17_9; setp.gt.u32 %p7, %r13, 255; mov.f32 %f71, %f70; @%p7 bra BB17_8; ld.shared.f32 %f6, [%rd1+1024]; setp.geu.f32 %p8, %f70, %f6; mov.f32 %f41, %f70; mov.f32 %f71, %f41; @%p8 bra BB17_8; st.shared.f32 [%rd1], %f6; ld.shared.u32 %r18, [%rd2+1024]; st.shared.u32 [%rd2], %r18; mov.f32 %f71, %f6; BB17_8: mov.f32 %f70, %f71; bar.sync 0; BB17_9: mov.f32 
// Stages for param_4 >= 256 (tid <= 127, offset +512) and param_4 >= 128 (tid <= 63, offset +256), each
// ending in bar.sync. From BB17_19 on only threads with tid <= 31 continue, using ld/st.volatile with no
// barrier between steps: offsets +128, +64, +32 (param_4 >= 64, 32, 16); the +16 step starts at the end of the line.
%f68, %f70; setp.lt.s32 %p9, %r11, 256; @%p9 bra BB17_14; setp.gt.u32 %p10, %r13, 127; mov.f32 %f69, %f68; @%p10 bra BB17_13; ld.shared.f32 %f9, [%rd1+512]; setp.geu.f32 %p11, %f68, %f9; mov.f32 %f45, %f68; mov.f32 %f69, %f45; @%p11 bra BB17_13; st.shared.f32 [%rd1], %f9; ld.shared.u32 %r20, [%rd2+512]; st.shared.u32 [%rd2], %r20; mov.f32 %f69, %f9; BB17_13: mov.f32 %f68, %f69; bar.sync 0; BB17_14: mov.f32 %f66, %f68; setp.lt.s32 %p12, %r11, 128; @%p12 bra BB17_19; setp.gt.u32 %p13, %r13, 63; mov.f32 %f67, %f66; @%p13 bra BB17_18; ld.shared.f32 %f12, [%rd1+256]; setp.geu.f32 %p14, %f66, %f12; mov.f32 %f49, %f66; mov.f32 %f67, %f49; @%p14 bra BB17_18; st.shared.f32 [%rd1], %f12; ld.shared.u32 %r22, [%rd2+256]; st.shared.u32 [%rd2], %r22; mov.f32 %f67, %f12; BB17_18: mov.f32 %f66, %f67; bar.sync 0; BB17_19: mov.f32 %f65, %f66; setp.gt.u32 %p15, %r13, 31; @%p15 bra BB17_39; setp.lt.s32 %p16, %r11, 64; @%p16 bra BB17_23; ld.volatile.shared.f32 %f29, [%rd1+128]; setp.geu.f32 %p17, %f65, %f29; @%p17 bra BB17_23; ld.volatile.shared.f32 %f65, [%rd1+128]; st.volatile.shared.f32 [%rd1], %f65; ld.volatile.shared.u32 %r24, [%rd2+128]; st.volatile.shared.u32 [%rd2], %r24; BB17_23: mov.f32 %f64, %f65; setp.lt.s32 %p18, %r11, 32; @%p18 bra BB17_26; ld.volatile.shared.f32 %f30, [%rd1+64]; setp.geu.f32 %p19, %f64, %f30; @%p19 bra BB17_26; ld.volatile.shared.f32 %f64, [%rd1+64]; st.volatile.shared.f32 [%rd1], %f64; ld.volatile.shared.u32 %r25, [%rd2+64]; st.volatile.shared.u32 [%rd2], %r25; BB17_26: mov.f32 %f63, %f64; setp.lt.s32 %p20, %r11, 16; @%p20 bra BB17_29; ld.volatile.shared.f32 %f31, [%rd1+32]; setp.geu.f32 %p21, %f63, %f31; @%p21 bra BB17_29; ld.volatile.shared.f32 %f63, [%rd1+32]; st.volatile.shared.f32 [%rd1], %f63; ld.volatile.shared.u32 %r26, [%rd2+32]; st.volatile.shared.u32 [%rd2], %r26; BB17_29: mov.f32 %f62, %f63; setp.lt.s32 %p22, %r11, 8; @%p22 bra BB17_32; ld.volatile.shared.f32 %f32, [%rd1+16]; setp.geu.f32 %p23, %f62, %f32; @%p23 bra BB17_32; 
// Remaining volatile steps: offsets +16, +8, +4 (param_4 >= 8, 4, 2).
// NOTE(review): the bar.sync at BB17_38 is reached only by threads with tid <= 31; the others branched
//   straight to BB17_39 - confirm this barrier placement is intended.
// BB17_39: thread 0 writes sdata[0] to param_1[%ctaid.x] and idxData[0] to param_2[%ctaid.x].
// The rest of the line opens reduce_max_filter_finalf (header, .reg declarations, first .shared declarations).
ld.volatile.shared.f32 %f62, [%rd1+16]; st.volatile.shared.f32 [%rd1], %f62; ld.volatile.shared.u32 %r27, [%rd2+16]; st.volatile.shared.u32 [%rd2], %r27; BB17_32: mov.f32 %f61, %f62; setp.lt.s32 %p24, %r11, 4; @%p24 bra BB17_35; ld.volatile.shared.f32 %f33, [%rd1+8]; setp.geu.f32 %p25, %f61, %f33; @%p25 bra BB17_35; ld.volatile.shared.f32 %f61, [%rd1+8]; st.volatile.shared.f32 [%rd1], %f61; ld.volatile.shared.u32 %r28, [%rd2+8]; st.volatile.shared.u32 [%rd2], %r28; BB17_35: setp.lt.s32 %p26, %r11, 2; @%p26 bra BB17_38; ld.volatile.shared.f32 %f34, [%rd1+4]; setp.geu.f32 %p27, %f61, %f34; @%p27 bra BB17_38; ld.volatile.shared.f32 %f35, [%rd1+4]; st.volatile.shared.f32 [%rd1], %f35; ld.volatile.shared.u32 %r29, [%rd2+4]; st.volatile.shared.u32 [%rd2], %r29; BB17_38: bar.sync 0; BB17_39: setp.ne.s32 %p28, %r13, 0; @%p28 bra BB17_41; ld.shared.f32 %f36, [reduce_max_mainf$__cuda_local_var_45285_32_non_const_sdata]; cvta.to.global.u64 %rd15, %rd4; mul.wide.u32 %rd16, %r14, 4; add.s64 %rd17, %rd15, %rd16; st.global.f32 [%rd17], %f36; ld.shared.u32 %r32, [reduce_max_mainf$__cuda_local_var_45286_30_non_const_idxData]; cvta.to.global.u64 %rd18, %rd5; add.s64 %rd19, %rd18, %rd16; st.global.u32 [%rd19], %r32; BB17_41: ret; } // .globl reduce_max_filter_finalf .visible .entry reduce_max_filter_finalf( .param .u64 reduce_max_filter_finalf_param_0, .param .u64 reduce_max_filter_finalf_param_1, .param .u64 reduce_max_filter_finalf_param_2, .param .u32 reduce_max_filter_finalf_param_3, .param .u32 reduce_max_filter_finalf_param_4, .param .u32 reduce_max_filter_finalf_param_5, .param .u64 reduce_max_filter_finalf_param_6, .param .u32 reduce_max_filter_finalf_param_7 ) { .reg .pred %p<75>; .reg .f32 %f<90>; .reg .b32 %r<157>; .reg .b64 %rd<118>; // demoted variable .shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow[40]; // demoted variable .shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol[40]; // 
// Body of reduce_max_filter_finalf (its .entry header and first declarations end the preceding line).
// Register roles as loaded below: %rd16/%rd17/%rd18 = param_0/param_1/param_2 (pointers), %r59 = param_3,
// %r60 = param_4, %r61 = param_5, %rd19 = param_6 (pointer), %r62 = param_7, %r63 = %tid.x.
// Prologue: threads with tid < param_7 load v = param_6[tid] and store v, v / param_4 and v % param_4
//   (unsigned div/rem) into smaxesVal[tid], smaxesRow[tid], smaxesCol[tid]; then bar.sync.
// NOTE(review): the smaxes* arrays are declared as 40 bytes (ten 4-byte entries) and the only guard is
//   tid < param_7 - presumably param_7 <= 10; confirm with the caller.
// Accumulate loop starts at BB18_4: i = %ctaid.x * (2*param_5) + %tid.x, running max starts at 0.0,
//   executed while i < param_3 (unsigned).
// NOTE(review): %r147 is read at BB18_4 before any write on the first iteration, and %r144 (stored to
//   idxData later) is only assigned inside the loop - confirm in the .cu source.
demoted variable .shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal[40]; // demoted variable .shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata[1024]; // demoted variable .shared .align 4 .b8 reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData[1024]; ld.param.u64 %rd16, [reduce_max_filter_finalf_param_0]; ld.param.u64 %rd17, [reduce_max_filter_finalf_param_1]; ld.param.u64 %rd18, [reduce_max_filter_finalf_param_2]; ld.param.u32 %r59, [reduce_max_filter_finalf_param_3]; ld.param.u32 %r60, [reduce_max_filter_finalf_param_4]; ld.param.u32 %r61, [reduce_max_filter_finalf_param_5]; ld.param.u64 %rd19, [reduce_max_filter_finalf_param_6]; ld.param.u32 %r62, [reduce_max_filter_finalf_param_7]; mov.u32 %r63, %tid.x; setp.ge.u32 %p1, %r63, %r62; @%p1 bra BB18_2; cvta.to.global.u64 %rd20, %rd19; mul.wide.u32 %rd21, %r63, 4; add.s64 %rd22, %rd20, %rd21; ld.global.u32 %r65, [%rd22]; mov.u64 %rd23, reduce_max_filter_finalf$__cuda_local_var_45427_30_non_const_smaxesVal; add.s64 %rd24, %rd23, %rd21; st.shared.u32 [%rd24], %r65; div.u32 %r66, %r65, %r60; mov.u64 %rd25, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd26, %rd25, %rd21; st.shared.u32 [%rd26], %r66; rem.u32 %r67, %r65, %r60; mov.u64 %rd27, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd28, %rd27, %rd21; st.shared.u32 [%rd28], %r67; BB18_2: bar.sync 0; shl.b32 %r69, %r61, 1; mov.u32 %r70, %ctaid.x; mad.lo.s32 %r132, %r70, %r69, %r63; mov.f32 %f86, 0f00000000; setp.ge.u32 %p2, %r132, %r59; @%p2 bra BB18_16; mov.f32 %f89, 0f00000000; BB18_4: mov.f32 %f71, %f89; mov.f32 %f1, %f71; mov.u32 %r140, %r147; mov.u32 %r2, %r140; cvta.to.global.u64 %rd29, %rd16; cvt.u64.u32 %rd1, %r132; mul.wide.u32 %rd30, %r132, 4; add.s64 %rd31, %rd29, %rd30; ld.global.f32 %f2, [%rd31]; setp.geu.f32 %p3, %f1, %f2; mov.u32 %r145, %r2; mov.f32 %f87, %f1; @%p3 bra BB18_9; 
// Candidate filtering: when max < param_0[i], the candidate index c = param_2[i] is split into c / param_4
// and c % param_4 (signed div/rem). BB18_6..BB18_8 scan k in [0, param_7): if smaxesRow[k] and smaxesCol[k]
// both match, the candidate is rejected and the previous max / index are kept; if no entry matches (or
// param_7 < 1) the candidate becomes the new max / index. BB18_9..BB18_15 repeat this for element i + param_5
// when that index is < param_3.
cvta.to.global.u64 %rd32, %rd18; shl.b64 %rd33, %rd1, 2; add.s64 %rd34, %rd32, %rd33; ld.global.u32 %r4, [%rd34]; div.s32 %r5, %r4, %r60; rem.s32 %r6, %r4, %r60; mov.u32 %r133, 0; setp.lt.s32 %p4, %r62, 1; mov.u32 %r145, %r4; mov.f32 %f87, %f2; @%p4 bra BB18_9; BB18_6: cvt.s64.s32 %rd2, %r133; mul.wide.s32 %rd35, %r133, 4; mov.u64 %rd36, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd37, %rd36, %rd35; ld.shared.u32 %r78, [%rd37]; setp.ne.s32 %p5, %r78, %r5; @%p5 bra BB18_8; shl.b64 %rd38, %rd2, 2; mov.u64 %rd39, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd40, %rd39, %rd38; ld.shared.u32 %r79, [%rd40]; setp.eq.s32 %p6, %r79, %r6; mov.u32 %r141, %r2; mov.u32 %r145, %r141; mov.f32 %f72, %f1; mov.f32 %f87, %f72; @%p6 bra BB18_9; BB18_8: add.s32 %r133, %r133, 1; setp.lt.s32 %p7, %r133, %r62; mov.u32 %r135, %r4; mov.u32 %r145, %r135; mov.f32 %f36, %f2; mov.f32 %f87, %f36; @%p7 bra BB18_6; BB18_9: mov.f32 %f3, %f87; mov.u32 %r9, %r145; add.s32 %r10, %r132, %r61; setp.ge.u32 %p8, %r10, %r59; mov.u32 %r146, %r9; mov.f32 %f88, %f3; @%p8 bra BB18_15; cvt.u64.u32 %rd3, %r10; mul.wide.u32 %rd42, %r10, 4; add.s64 %rd43, %rd29, %rd42; ld.global.f32 %f4, [%rd43]; setp.geu.f32 %p9, %f3, %f4; mov.u32 %r138, %r9; mov.u32 %r146, %r138; mov.f32 %f69, %f3; mov.f32 %f88, %f69; @%p9 bra BB18_15; cvta.to.global.u64 %rd44, %rd18; shl.b64 %rd45, %rd3, 2; add.s64 %rd46, %rd44, %rd45; ld.global.u32 %r11, [%rd46]; div.s32 %r12, %r11, %r60; rem.s32 %r13, %r11, %r60; mov.u32 %r134, 0; setp.lt.s32 %p10, %r62, 1; mov.u32 %r146, %r11; mov.f32 %f88, %f4; @%p10 bra BB18_15; BB18_12: cvt.s64.s32 %rd4, %r134; mul.wide.s32 %rd47, %r134, 4; mov.u64 %rd48, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd49, %rd48, %rd47; ld.shared.u32 %r81, [%rd49]; setp.ne.s32 %p11, %r81, %r12; @%p11 bra BB18_14; shl.b64 %rd50, %rd4, 2; mov.u64 %rd51, 
// BB18_15 advances i by %nctaid.x * (2*param_5) and loops back to BB18_4 while i < param_3.
// BB18_16 stores max / index to sdata[tid] / idxData[tid], then bar.sync.
// First reduction stage (param_5 >= 512, tid <= 255, byte offset +1024): when the partner value is larger,
// the partner's idxData entry is split by param_4 and scanned against smaxesRow/smaxesCol (BB18_20..BB18_22);
// a match leaves the own slot unchanged, otherwise BB18_23 copies the partner value and index.
// NOTE(review): sdata and idxData are declared as 1024 bytes, so the +1024 accesses lie outside them -
//   presumably param_5 is never >= 512; confirm against the launch code.
reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd52, %rd51, %rd50; ld.shared.u32 %r82, [%rd52]; setp.eq.s32 %p12, %r82, %r13; mov.u32 %r139, %r9; mov.u32 %r146, %r139; mov.f32 %f70, %f3; mov.f32 %f88, %f70; @%p12 bra BB18_15; BB18_14: add.s32 %r134, %r134, 1; setp.lt.s32 %p13, %r134, %r62; mov.u32 %r136, %r11; mov.u32 %r146, %r136; mov.f32 %f37, %f4; mov.f32 %f88, %f37; @%p13 bra BB18_12; BB18_15: mov.f32 %f89, %f88; mov.u32 %r147, %r146; mov.u32 %r84, %nctaid.x; mad.lo.s32 %r132, %r84, %r69, %r132; setp.lt.u32 %p14, %r132, %r59; mov.u32 %r144, %r147; mov.f32 %f86, %f89; @%p14 bra BB18_4; BB18_16: mov.f32 %f84, %f86; mul.wide.u32 %rd53, %r63, 4; mov.u64 %rd54, reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata; add.s64 %rd5, %rd54, %rd53; st.shared.f32 [%rd5], %f84; mov.u64 %rd55, reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData; add.s64 %rd6, %rd55, %rd53; st.shared.u32 [%rd6], %r144; bar.sync 0; setp.lt.s32 %p15, %r61, 512; @%p15 bra BB18_25; setp.gt.u32 %p16, %r63, 255; mov.f32 %f85, %f84; @%p16 bra BB18_24; ld.shared.f32 %f7, [%rd5+1024]; setp.geu.f32 %p17, %f84, %f7; mov.f32 %f66, %f84; mov.f32 %f85, %f66; @%p17 bra BB18_24; ld.shared.u32 %r19, [%rd6+1024]; div.s32 %r20, %r19, %r60; rem.s32 %r21, %r19, %r60; mov.u32 %r148, 0; setp.lt.s32 %p18, %r62, 1; @%p18 bra BB18_23; BB18_20: cvt.s64.s32 %rd7, %r148; mul.wide.s32 %rd56, %r148, 4; mov.u64 %rd57, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd58, %rd57, %rd56; ld.shared.u32 %r88, [%rd58]; setp.ne.s32 %p19, %r88, %r20; @%p19 bra BB18_22; shl.b64 %rd59, %rd7, 2; mov.u64 %rd60, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd61, %rd60, %rd59; ld.shared.u32 %r89, [%rd61]; setp.eq.s32 %p20, %r89, %r21; mov.f32 %f67, %f84; mov.f32 %f85, %f67; @%p20 bra BB18_24; BB18_22: add.s32 %r148, %r148, 1; setp.lt.s32 %p21, %r148, %r62; @%p21 bra BB18_20; BB18_23: st.shared.f32 [%rd5], 
// Same filtered compare-and-copy pattern for param_5 >= 256 (tid <= 127, offset +512) and
// param_5 >= 128 (tid <= 63, offset +256); each stage ends in bar.sync.
%f7; st.shared.u32 [%rd6], %r19; mov.f32 %f85, %f7; BB18_24: mov.f32 %f84, %f85; bar.sync 0; BB18_25: mov.f32 %f82, %f84; setp.lt.s32 %p22, %r61, 256; @%p22 bra BB18_34; setp.gt.u32 %p23, %r63, 127; mov.f32 %f83, %f82; @%p23 bra BB18_33; ld.shared.f32 %f10, [%rd5+512]; setp.geu.f32 %p24, %f82, %f10; mov.f32 %f61, %f82; mov.f32 %f83, %f61; @%p24 bra BB18_33; ld.shared.u32 %r24, [%rd6+512]; div.s32 %r25, %r24, %r60; rem.s32 %r26, %r24, %r60; mov.u32 %r149, 0; setp.lt.s32 %p25, %r62, 1; @%p25 bra BB18_32; BB18_29: cvt.s64.s32 %rd8, %r149; mul.wide.s32 %rd62, %r149, 4; mov.u64 %rd63, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd64, %rd63, %rd62; ld.shared.u32 %r92, [%rd64]; setp.ne.s32 %p26, %r92, %r25; @%p26 bra BB18_31; shl.b64 %rd65, %rd8, 2; mov.u64 %rd66, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd67, %rd66, %rd65; ld.shared.u32 %r93, [%rd67]; setp.eq.s32 %p27, %r93, %r26; mov.f32 %f62, %f82; mov.f32 %f83, %f62; @%p27 bra BB18_33; BB18_31: add.s32 %r149, %r149, 1; setp.lt.s32 %p28, %r149, %r62; @%p28 bra BB18_29; BB18_32: st.shared.f32 [%rd5], %f10; st.shared.u32 [%rd6], %r24; mov.f32 %f83, %f10; BB18_33: mov.f32 %f82, %f83; bar.sync 0; BB18_34: mov.f32 %f80, %f82; setp.lt.s32 %p29, %r61, 128; @%p29 bra BB18_43; setp.gt.u32 %p30, %r63, 63; mov.f32 %f81, %f80; @%p30 bra BB18_42; ld.shared.f32 %f13, [%rd5+256]; setp.geu.f32 %p31, %f80, %f13; mov.f32 %f56, %f80; mov.f32 %f81, %f56; @%p31 bra BB18_42; ld.shared.u32 %r29, [%rd6+256]; div.s32 %r30, %r29, %r60; rem.s32 %r31, %r29, %r60; mov.u32 %r150, 0; setp.lt.s32 %p32, %r62, 1; @%p32 bra BB18_41; BB18_38: cvt.s64.s32 %rd9, %r150; mul.wide.s32 %rd68, %r150, 4; mov.u64 %rd69, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd70, %rd69, %rd68; ld.shared.u32 %r96, [%rd70]; setp.ne.s32 %p33, %r96, %r30; @%p33 bra BB18_40; shl.b64 %rd71, %rd9, 2; mov.u64 %rd72, 
// End of the +256 stage. From BB18_43 on only threads with tid <= 31 continue, using ld/st.volatile with no
// barrier between steps; every step keeps the same filter scan before copying.
// This line holds the +128 step (param_5 >= 64) and the start of the +64 step (param_5 >= 32).
reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd73, %rd72, %rd71; ld.shared.u32 %r97, [%rd73]; setp.eq.s32 %p34, %r97, %r31; mov.f32 %f57, %f80; mov.f32 %f81, %f57; @%p34 bra BB18_42; BB18_40: add.s32 %r150, %r150, 1; setp.lt.s32 %p35, %r150, %r62; @%p35 bra BB18_38; BB18_41: st.shared.f32 [%rd5], %f13; st.shared.u32 [%rd6], %r29; mov.f32 %f81, %f13; BB18_42: mov.f32 %f80, %f81; bar.sync 0; BB18_43: mov.f32 %f15, %f80; setp.gt.u32 %p36, %r63, 31; @%p36 bra BB18_87; setp.lt.s32 %p37, %r61, 64; mov.f32 %f79, %f15; @%p37 bra BB18_51; ld.volatile.shared.f32 %f28, [%rd5+128]; setp.geu.f32 %p38, %f15, %f28; mov.f32 %f51, %f15; mov.f32 %f79, %f51; @%p38 bra BB18_51; ld.volatile.shared.u32 %r100, [%rd6+128]; div.s32 %r34, %r100, %r60; rem.s32 %r35, %r100, %r60; mov.u32 %r151, 0; setp.lt.s32 %p39, %r62, 1; @%p39 bra BB18_50; BB18_47: cvt.s64.s32 %rd10, %r151; mul.wide.s32 %rd74, %r151, 4; mov.u64 %rd75, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd76, %rd75, %rd74; ld.volatile.shared.u32 %r101, [%rd76]; setp.ne.s32 %p40, %r101, %r34; @%p40 bra BB18_49; shl.b64 %rd77, %rd10, 2; mov.u64 %rd78, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd79, %rd78, %rd77; ld.volatile.shared.u32 %r102, [%rd79]; setp.eq.s32 %p41, %r102, %r35; mov.f32 %f52, %f15; mov.f32 %f79, %f52; @%p41 bra BB18_51; BB18_49: add.s32 %r151, %r151, 1; setp.lt.s32 %p42, %r151, %r62; @%p42 bra BB18_47; BB18_50: ld.volatile.shared.f32 %f79, [%rd5+128]; st.volatile.shared.f32 [%rd5], %f79; ld.volatile.shared.u32 %r103, [%rd6+128]; st.volatile.shared.u32 [%rd6], %r103; BB18_51: mov.f32 %f17, %f79; setp.lt.s32 %p43, %r61, 32; mov.f32 %f78, %f17; @%p43 bra BB18_58; ld.volatile.shared.f32 %f29, [%rd5+64]; setp.geu.f32 %p44, %f17, %f29; mov.f32 %f48, %f17; mov.f32 %f78, %f48; @%p44 bra BB18_58; ld.volatile.shared.u32 %r105, [%rd6+64]; div.s32 %r38, %r105, %r60; rem.s32 %r39, %r105, %r60; mov.u32 %r152, 0; 
// Filter scan and copy for the +64 step, then the +32 step (param_5 >= 16).
setp.lt.s32 %p45, %r62, 1; @%p45 bra BB18_57; BB18_54: cvt.s64.s32 %rd11, %r152; mul.wide.s32 %rd80, %r152, 4; mov.u64 %rd81, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd82, %rd81, %rd80; ld.volatile.shared.u32 %r106, [%rd82]; setp.ne.s32 %p46, %r106, %r38; @%p46 bra BB18_56; shl.b64 %rd83, %rd11, 2; mov.u64 %rd84, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd85, %rd84, %rd83; ld.volatile.shared.u32 %r107, [%rd85]; setp.eq.s32 %p47, %r107, %r39; mov.f32 %f49, %f17; mov.f32 %f78, %f49; @%p47 bra BB18_58; BB18_56: add.s32 %r152, %r152, 1; setp.lt.s32 %p48, %r152, %r62; @%p48 bra BB18_54; BB18_57: ld.volatile.shared.f32 %f78, [%rd5+64]; st.volatile.shared.f32 [%rd5], %f78; ld.volatile.shared.u32 %r108, [%rd6+64]; st.volatile.shared.u32 [%rd6], %r108; BB18_58: mov.f32 %f19, %f78; setp.lt.s32 %p49, %r61, 16; mov.f32 %f77, %f19; @%p49 bra BB18_65; ld.volatile.shared.f32 %f30, [%rd5+32]; setp.geu.f32 %p50, %f19, %f30; mov.f32 %f45, %f19; mov.f32 %f77, %f45; @%p50 bra BB18_65; ld.volatile.shared.u32 %r110, [%rd6+32]; div.s32 %r42, %r110, %r60; rem.s32 %r43, %r110, %r60; mov.u32 %r153, 0; setp.lt.s32 %p51, %r62, 1; @%p51 bra BB18_64; BB18_61: cvt.s64.s32 %rd12, %r153; mul.wide.s32 %rd86, %r153, 4; mov.u64 %rd87, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd88, %rd87, %rd86; ld.volatile.shared.u32 %r111, [%rd88]; setp.ne.s32 %p52, %r111, %r42; @%p52 bra BB18_63; shl.b64 %rd89, %rd12, 2; mov.u64 %rd90, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd91, %rd90, %rd89; ld.volatile.shared.u32 %r112, [%rd91]; setp.eq.s32 %p53, %r112, %r43; mov.f32 %f46, %f19; mov.f32 %f77, %f46; @%p53 bra BB18_65; BB18_63: add.s32 %r153, %r153, 1; setp.lt.s32 %p54, %r153, %r62; @%p54 bra BB18_61; BB18_64: ld.volatile.shared.f32 %f77, [%rd5+32]; st.volatile.shared.f32 [%rd5], %f77; ld.volatile.shared.u32 %r113, [%rd6+32]; st.volatile.shared.u32 
// The +16 step (param_5 >= 8) and most of the +8 step (param_5 >= 4).
[%rd6], %r113; BB18_65: mov.f32 %f21, %f77; setp.lt.s32 %p55, %r61, 8; mov.f32 %f76, %f21; @%p55 bra BB18_72; ld.volatile.shared.f32 %f31, [%rd5+16]; setp.geu.f32 %p56, %f21, %f31; mov.f32 %f42, %f21; mov.f32 %f76, %f42; @%p56 bra BB18_72; ld.volatile.shared.u32 %r115, [%rd6+16]; div.s32 %r46, %r115, %r60; rem.s32 %r47, %r115, %r60; mov.u32 %r154, 0; setp.lt.s32 %p57, %r62, 1; @%p57 bra BB18_71; BB18_68: cvt.s64.s32 %rd13, %r154; mul.wide.s32 %rd92, %r154, 4; mov.u64 %rd93, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd94, %rd93, %rd92; ld.volatile.shared.u32 %r116, [%rd94]; setp.ne.s32 %p58, %r116, %r46; @%p58 bra BB18_70; shl.b64 %rd95, %rd13, 2; mov.u64 %rd96, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd97, %rd96, %rd95; ld.volatile.shared.u32 %r117, [%rd97]; setp.eq.s32 %p59, %r117, %r47; mov.f32 %f43, %f21; mov.f32 %f76, %f43; @%p59 bra BB18_72; BB18_70: add.s32 %r154, %r154, 1; setp.lt.s32 %p60, %r154, %r62; @%p60 bra BB18_68; BB18_71: ld.volatile.shared.f32 %f76, [%rd5+16]; st.volatile.shared.f32 [%rd5], %f76; ld.volatile.shared.u32 %r118, [%rd6+16]; st.volatile.shared.u32 [%rd6], %r118; BB18_72: mov.f32 %f23, %f76; setp.lt.s32 %p61, %r61, 4; mov.f32 %f75, %f23; @%p61 bra BB18_79; ld.volatile.shared.f32 %f32, [%rd5+8]; setp.geu.f32 %p62, %f23, %f32; mov.f32 %f39, %f23; mov.f32 %f75, %f39; @%p62 bra BB18_79; ld.volatile.shared.u32 %r120, [%rd6+8]; div.s32 %r50, %r120, %r60; rem.s32 %r51, %r120, %r60; mov.u32 %r155, 0; setp.lt.s32 %p63, %r62, 1; @%p63 bra BB18_78; BB18_75: cvt.s64.s32 %rd14, %r155; mul.wide.s32 %rd98, %r155, 4; mov.u64 %rd99, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd100, %rd99, %rd98; ld.volatile.shared.u32 %r121, [%rd100]; setp.ne.s32 %p64, %r121, %r50; @%p64 bra BB18_77; shl.b64 %rd101, %rd14, 2; mov.u64 %rd102, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd103, %rd102, %rd101; 
// End of the +8 step, then the +4 step (param_5 >= 2).
// NOTE(review): the bar.sync at BB18_86 is reached only by threads with tid <= 31; the others branched
//   straight to BB18_87 - confirm this barrier placement is intended.
// BB18_87: thread 0 writes sdata[0] to param_1[%ctaid.x] and idxData[0] (%r58) to param_2[%ctaid.x];
//   when %nctaid.x == 1 it falls through to one more store at the start of the next line.
ld.volatile.shared.u32 %r122, [%rd103]; setp.eq.s32 %p65, %r122, %r51; mov.f32 %f40, %f23; mov.f32 %f75, %f40; @%p65 bra BB18_79; BB18_77: add.s32 %r155, %r155, 1; setp.lt.s32 %p66, %r155, %r62; @%p66 bra BB18_75; BB18_78: ld.volatile.shared.f32 %f75, [%rd5+8]; st.volatile.shared.f32 [%rd5], %f75; ld.volatile.shared.u32 %r123, [%rd6+8]; st.volatile.shared.u32 [%rd6], %r123; BB18_79: setp.lt.s32 %p67, %r61, 2; @%p67 bra BB18_86; ld.volatile.shared.f32 %f33, [%rd5+4]; setp.geu.f32 %p68, %f75, %f33; @%p68 bra BB18_86; ld.volatile.shared.u32 %r125, [%rd6+4]; div.s32 %r54, %r125, %r60; rem.s32 %r55, %r125, %r60; mov.u32 %r156, 0; setp.lt.s32 %p69, %r62, 1; @%p69 bra BB18_85; BB18_82: cvt.s64.s32 %rd15, %r156; mul.wide.s32 %rd104, %r156, 4; mov.u64 %rd105, reduce_max_filter_finalf$__cuda_local_var_45425_30_non_const_smaxesRow; add.s64 %rd106, %rd105, %rd104; ld.volatile.shared.u32 %r126, [%rd106]; setp.ne.s32 %p70, %r126, %r54; @%p70 bra BB18_84; shl.b64 %rd107, %rd15, 2; mov.u64 %rd108, reduce_max_filter_finalf$__cuda_local_var_45426_30_non_const_smaxesCol; add.s64 %rd109, %rd108, %rd107; ld.volatile.shared.u32 %r127, [%rd109]; setp.eq.s32 %p71, %r127, %r55; @%p71 bra BB18_86; BB18_84: add.s32 %r156, %r156, 1; setp.lt.s32 %p72, %r156, %r62; @%p72 bra BB18_82; BB18_85: ld.volatile.shared.f32 %f34, [%rd5+4]; st.volatile.shared.f32 [%rd5], %f34; ld.volatile.shared.u32 %r128, [%rd6+4]; st.volatile.shared.u32 [%rd6], %r128; BB18_86: bar.sync 0; BB18_87: setp.ne.s32 %p73, %r63, 0; @%p73 bra BB18_90; ld.shared.f32 %f35, [reduce_max_filter_finalf$__cuda_local_var_45428_32_non_const_sdata]; cvta.to.global.u64 %rd110, %rd17; mul.wide.u32 %rd111, %r70, 4; add.s64 %rd112, %rd110, %rd111; st.global.f32 [%rd112], %f35; ld.shared.u32 %r58, [reduce_max_filter_finalf$__cuda_local_var_45429_30_non_const_idxData]; cvta.to.global.u64 %rd113, %rd18; add.s64 %rd114, %rd113, %rd111; st.global.u32 [%rd114], %r58; mov.u32 %r131, %nctaid.x; setp.ne.s32 %p74, %r131, 1; @%p74 bra BB18_90; 
// Single-block case of reduce_max_filter_finalf: %r58 is also stored at param_6[param_7]
//   (4-byte elements, param_7 sign-extended), i.e. one slot past the entries read in the prologue.
// NOTE(review): nothing here checks that param_6 has room for that extra element - confirm with the caller.
// After "BB18_90: ret; }" the line opens reduce_max_filter_mainf, whose body continues on the following lines.
cvta.to.global.u64 %rd115, %rd19; mul.wide.s32 %rd116, %r62, 4; add.s64 %rd117, %rd115, %rd116; st.global.u32 [%rd117], %r58; BB18_90: ret; } // .globl reduce_max_filter_mainf .visible .entry reduce_max_filter_mainf( .param .u64 reduce_max_filter_mainf_param_0, .param .u64 reduce_max_filter_mainf_param_1, .param .u64 reduce_max_filter_mainf_param_2, .param .u32 reduce_max_filter_mainf_param_3, .param .u32 reduce_max_filter_mainf_param_4, .param .u32 reduce_max_filter_mainf_param_5, .param .u64 reduce_max_filter_mainf_param_6, .param .u32 reduce_max_filter_mainf_param_7 ) { .reg .pred %p<87>; .reg .f32 %f<88>; .reg .b32 %r<147>; .reg .b64 %rd<110>; // demoted variable .shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow[40]; // demoted variable .shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol[40]; // demoted variable .shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal[40]; // demoted variable .shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata[1024]; // demoted variable .shared .align 4 .b8 reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData[1024]; ld.param.u64 %rd14, [reduce_max_filter_mainf_param_0]; ld.param.u64 %rd15, [reduce_max_filter_mainf_param_1]; ld.param.u64 %rd16, [reduce_max_filter_mainf_param_2]; ld.param.u32 %r60, [reduce_max_filter_mainf_param_3]; ld.param.u32 %r61, [reduce_max_filter_mainf_param_4]; ld.param.u32 %r62, [reduce_max_filter_mainf_param_5]; ld.param.u64 %rd17, [reduce_max_filter_mainf_param_6]; ld.param.u32 %r63, [reduce_max_filter_mainf_param_7]; mov.u32 %r1, %tid.x; setp.ge.u32 %p3, %r1, %r63; @%p3 bra BB19_2; cvta.to.global.u64 %rd18, %rd17; mul.wide.u32 %rd19, %r1, 4; add.s64 %rd20, %rd18, %rd19; ld.global.u32 %r65, [%rd20]; mov.u64 %rd21, reduce_max_filter_mainf$__cuda_local_var_45638_30_non_const_smaxesVal; add.s64 %rd22, %rd21, %rd19; st.shared.u32 [%rd22], %r65; 
div.u32 %r66, %r65, %r60; mov.u64 %rd23, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd24, %rd23, %rd19; st.shared.u32 [%rd24], %r66; rem.u32 %r67, %r65, %r60; mov.u64 %rd25, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd26, %rd25, %rd19; st.shared.u32 [%rd26], %r67; BB19_2: mov.u32 %r68, %ctaid.x; mad.lo.s32 %r134, %r68, %r62, %r1; bar.sync 0; mul.lo.s32 %r70, %r61, %r60; mov.f32 %f84, 0fFF800000; setp.ge.u32 %p4, %r134, %r70; @%p4 bra BB19_18; mov.f32 %f87, 0fFF800000; BB19_4: mov.f32 %f70, %f87; mov.f32 %f85, %f70; cvta.to.global.u64 %rd27, %rd14; mul.wide.u32 %rd28, %r134, 4; add.s64 %rd29, %rd27, %rd28; ld.global.f32 %f2, [%rd29]; setp.geu.f32 %p5, %f85, %f2; @%p5 bra BB19_10; div.s32 %r6, %r134, %r60; rem.s32 %r7, %r134, %r60; mov.pred %p6, -1; mov.u32 %r135, 0; setp.lt.s32 %p7, %r63, 1; mov.pred %p84, %p6; @%p7 bra BB19_9; BB19_6: cvt.s64.s32 %rd1, %r135; mul.wide.s32 %rd30, %r135, 4; mov.u64 %rd31, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd32, %rd31, %rd30; ld.shared.u32 %r75, [%rd32]; setp.ne.s32 %p8, %r75, %r6; @%p8 bra BB19_8; shl.b64 %rd33, %rd1, 2; mov.u64 %rd34, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd35, %rd34, %rd33; ld.shared.u32 %r76, [%rd35]; setp.eq.s32 %p10, %r76, %r7; mov.pred %p9, 0; mov.pred %p84, %p9; @%p10 bra BB19_9; BB19_8: add.s32 %r135, %r135, 1; setp.lt.s32 %p12, %r135, %r63; mov.pred %p83, %p6; mov.pred %p84, %p83; @%p12 bra BB19_6; BB19_9: selp.f32 %f85, %f2, %f85, %p84; selp.b32 %r137, %r134, %r137, %p84; BB19_10: mov.f32 %f4, %f85; add.s32 %r78, %r134, %r62; setp.ge.u32 %p13, %r78, %r70; mov.f32 %f86, %f4; @%p13 bra BB19_17; mul.wide.u32 %rd37, %r78, 4; add.s64 %rd38, %rd27, %rd37; ld.global.f32 %f5, [%rd38]; setp.geu.f32 %p14, %f4, %f5; mov.f32 %f69, %f4; mov.f32 %f86, %f69; @%p14 bra BB19_17; div.s32 %r12, %r78, %r60; rem.s32 %r13, %r78, %r60; mov.pred %p15, -1; mov.u32 %r136, 
0; setp.lt.s32 %p16, %r63, 1; mov.pred %p86, %p15; @%p16 bra BB19_16; BB19_13: cvt.s64.s32 %rd2, %r136; mul.wide.s32 %rd39, %r136, 4; mov.u64 %rd40, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd41, %rd40, %rd39; ld.shared.u32 %r82, [%rd41]; setp.ne.s32 %p17, %r82, %r12; @%p17 bra BB19_15; shl.b64 %rd42, %rd2, 2; mov.u64 %rd43, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd44, %rd43, %rd42; ld.shared.u32 %r83, [%rd44]; setp.eq.s32 %p19, %r83, %r13; mov.pred %p18, 0; mov.pred %p86, %p18; @%p19 bra BB19_16; BB19_15: add.s32 %r136, %r136, 1; setp.lt.s32 %p21, %r136, %r63; mov.pred %p85, %p15; mov.pred %p86, %p85; @%p21 bra BB19_13; BB19_16: selp.f32 %f86, %f5, %f4, %p86; selp.b32 %r137, %r78, %r137, %p86; BB19_17: mov.f32 %f87, %f86; mov.u32 %r85, %nctaid.x; mad.lo.s32 %r134, %r85, %r62, %r134; setp.lt.u32 %p22, %r134, %r70; mov.f32 %f84, %f87; @%p22 bra BB19_4; BB19_18: mov.f32 %f82, %f84; mul.wide.u32 %rd45, %r1, 4; mov.u64 %rd46, reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata; add.s64 %rd3, %rd46, %rd45; st.shared.f32 [%rd3], %f82; mov.u64 %rd47, reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData; add.s64 %rd4, %rd47, %rd45; st.shared.u32 [%rd4], %r137; bar.sync 0; setp.lt.s32 %p23, %r62, 512; @%p23 bra BB19_27; setp.gt.u32 %p24, %r1, 255; mov.f32 %f83, %f82; @%p24 bra BB19_26; ld.shared.f32 %f9, [%rd3+1024]; setp.geu.f32 %p25, %f82, %f9; mov.f32 %f66, %f82; mov.f32 %f83, %f66; @%p25 bra BB19_26; ld.shared.u32 %r20, [%rd4+1024]; div.s32 %r21, %r20, %r60; rem.s32 %r22, %r20, %r60; mov.u32 %r138, 0; setp.lt.s32 %p26, %r63, 1; @%p26 bra BB19_25; BB19_22: cvt.s64.s32 %rd5, %r138; mul.wide.s32 %rd48, %r138, 4; mov.u64 %rd49, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd50, %rd49, %rd48; ld.shared.u32 %r90, [%rd50]; setp.ne.s32 %p27, %r90, %r21; @%p27 bra BB19_24; shl.b64 %rd51, %rd5, 2; mov.u64 %rd52, 
reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd53, %rd52, %rd51; ld.shared.u32 %r91, [%rd53]; setp.eq.s32 %p28, %r91, %r22; mov.f32 %f67, %f82; mov.f32 %f83, %f67; @%p28 bra BB19_26; BB19_24: add.s32 %r138, %r138, 1; setp.lt.s32 %p29, %r138, %r63; @%p29 bra BB19_22; BB19_25: st.shared.f32 [%rd3], %f9; st.shared.u32 [%rd4], %r20; mov.f32 %f83, %f9; BB19_26: mov.f32 %f82, %f83; bar.sync 0; BB19_27: mov.f32 %f80, %f82; setp.lt.s32 %p30, %r62, 256; @%p30 bra BB19_36; setp.gt.u32 %p31, %r1, 127; mov.f32 %f81, %f80; @%p31 bra BB19_35; ld.shared.f32 %f12, [%rd3+512]; setp.geu.f32 %p32, %f80, %f12; mov.f32 %f61, %f80; mov.f32 %f81, %f61; @%p32 bra BB19_35; ld.shared.u32 %r25, [%rd4+512]; div.s32 %r26, %r25, %r60; rem.s32 %r27, %r25, %r60; mov.u32 %r139, 0; setp.lt.s32 %p33, %r63, 1; @%p33 bra BB19_34; BB19_31: cvt.s64.s32 %rd6, %r139; mul.wide.s32 %rd54, %r139, 4; mov.u64 %rd55, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd56, %rd55, %rd54; ld.shared.u32 %r94, [%rd56]; setp.ne.s32 %p34, %r94, %r26; @%p34 bra BB19_33; shl.b64 %rd57, %rd6, 2; mov.u64 %rd58, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd59, %rd58, %rd57; ld.shared.u32 %r95, [%rd59]; setp.eq.s32 %p35, %r95, %r27; mov.f32 %f62, %f80; mov.f32 %f81, %f62; @%p35 bra BB19_35; BB19_33: add.s32 %r139, %r139, 1; setp.lt.s32 %p36, %r139, %r63; @%p36 bra BB19_31; BB19_34: st.shared.f32 [%rd3], %f12; st.shared.u32 [%rd4], %r25; mov.f32 %f81, %f12; BB19_35: mov.f32 %f80, %f81; bar.sync 0; BB19_36: mov.f32 %f78, %f80; setp.lt.s32 %p37, %r62, 128; @%p37 bra BB19_45; setp.gt.u32 %p38, %r1, 63; mov.f32 %f79, %f78; @%p38 bra BB19_44; ld.shared.f32 %f15, [%rd3+256]; setp.geu.f32 %p39, %f78, %f15; mov.f32 %f56, %f78; mov.f32 %f79, %f56; @%p39 bra BB19_44; ld.shared.u32 %r30, [%rd4+256]; div.s32 %r31, %r30, %r60; rem.s32 %r32, %r30, %r60; mov.u32 %r140, 0; setp.lt.s32 %p40, %r63, 1; @%p40 bra BB19_43; BB19_40: cvt.s64.s32 
%rd7, %r140; mul.wide.s32 %rd60, %r140, 4; mov.u64 %rd61, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd62, %rd61, %rd60; ld.shared.u32 %r98, [%rd62]; setp.ne.s32 %p41, %r98, %r31; @%p41 bra BB19_42; shl.b64 %rd63, %rd7, 2; mov.u64 %rd64, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd65, %rd64, %rd63; ld.shared.u32 %r99, [%rd65]; setp.eq.s32 %p42, %r99, %r32; mov.f32 %f57, %f78; mov.f32 %f79, %f57; @%p42 bra BB19_44; BB19_42: add.s32 %r140, %r140, 1; setp.lt.s32 %p43, %r140, %r63; @%p43 bra BB19_40; BB19_43: st.shared.f32 [%rd3], %f15; st.shared.u32 [%rd4], %r30; mov.f32 %f79, %f15; BB19_44: mov.f32 %f78, %f79; bar.sync 0; BB19_45: mov.f32 %f17, %f78; setp.gt.u32 %p44, %r1, 31; @%p44 bra BB19_89; setp.lt.s32 %p45, %r62, 64; mov.f32 %f77, %f17; @%p45 bra BB19_53; ld.volatile.shared.f32 %f30, [%rd3+128]; setp.geu.f32 %p46, %f17, %f30; mov.f32 %f51, %f17; mov.f32 %f77, %f51; @%p46 bra BB19_53; ld.volatile.shared.u32 %r102, [%rd4+128]; div.s32 %r35, %r102, %r60; rem.s32 %r36, %r102, %r60; mov.u32 %r141, 0; setp.lt.s32 %p47, %r63, 1; @%p47 bra BB19_52; BB19_49: cvt.s64.s32 %rd8, %r141; mul.wide.s32 %rd66, %r141, 4; mov.u64 %rd67, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd68, %rd67, %rd66; ld.volatile.shared.u32 %r103, [%rd68]; setp.ne.s32 %p48, %r103, %r35; @%p48 bra BB19_51; shl.b64 %rd69, %rd8, 2; mov.u64 %rd70, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd71, %rd70, %rd69; ld.volatile.shared.u32 %r104, [%rd71]; setp.eq.s32 %p49, %r104, %r36; mov.f32 %f52, %f17; mov.f32 %f77, %f52; @%p49 bra BB19_53; BB19_51: add.s32 %r141, %r141, 1; setp.lt.s32 %p50, %r141, %r63; @%p50 bra BB19_49; BB19_52: ld.volatile.shared.f32 %f77, [%rd3+128]; st.volatile.shared.f32 [%rd3], %f77; ld.volatile.shared.u32 %r105, [%rd4+128]; st.volatile.shared.u32 [%rd4], %r105; BB19_53: mov.f32 %f19, %f77; setp.lt.s32 %p51, %r62, 32; mov.f32 %f76, %f19; 
@%p51 bra BB19_60; ld.volatile.shared.f32 %f31, [%rd3+64]; setp.geu.f32 %p52, %f19, %f31; mov.f32 %f48, %f19; mov.f32 %f76, %f48; @%p52 bra BB19_60; ld.volatile.shared.u32 %r107, [%rd4+64]; div.s32 %r39, %r107, %r60; rem.s32 %r40, %r107, %r60; mov.u32 %r142, 0; setp.lt.s32 %p53, %r63, 1; @%p53 bra BB19_59; BB19_56: cvt.s64.s32 %rd9, %r142; mul.wide.s32 %rd72, %r142, 4; mov.u64 %rd73, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd74, %rd73, %rd72; ld.volatile.shared.u32 %r108, [%rd74]; setp.ne.s32 %p54, %r108, %r39; @%p54 bra BB19_58; shl.b64 %rd75, %rd9, 2; mov.u64 %rd76, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd77, %rd76, %rd75; ld.volatile.shared.u32 %r109, [%rd77]; setp.eq.s32 %p55, %r109, %r40; mov.f32 %f49, %f19; mov.f32 %f76, %f49; @%p55 bra BB19_60; BB19_58: add.s32 %r142, %r142, 1; setp.lt.s32 %p56, %r142, %r63; @%p56 bra BB19_56; BB19_59: ld.volatile.shared.f32 %f76, [%rd3+64]; st.volatile.shared.f32 [%rd3], %f76; ld.volatile.shared.u32 %r110, [%rd4+64]; st.volatile.shared.u32 [%rd4], %r110; BB19_60: mov.f32 %f21, %f76; setp.lt.s32 %p57, %r62, 16; mov.f32 %f75, %f21; @%p57 bra BB19_67; ld.volatile.shared.f32 %f32, [%rd3+32]; setp.geu.f32 %p58, %f21, %f32; mov.f32 %f45, %f21; mov.f32 %f75, %f45; @%p58 bra BB19_67; ld.volatile.shared.u32 %r112, [%rd4+32]; div.s32 %r43, %r112, %r60; rem.s32 %r44, %r112, %r60; mov.u32 %r143, 0; setp.lt.s32 %p59, %r63, 1; @%p59 bra BB19_66; BB19_63: cvt.s64.s32 %rd10, %r143; mul.wide.s32 %rd78, %r143, 4; mov.u64 %rd79, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd80, %rd79, %rd78; ld.volatile.shared.u32 %r113, [%rd80]; setp.ne.s32 %p60, %r113, %r43; @%p60 bra BB19_65; shl.b64 %rd81, %rd10, 2; mov.u64 %rd82, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd83, %rd82, %rd81; ld.volatile.shared.u32 %r114, [%rd83]; setp.eq.s32 %p61, %r114, %r44; mov.f32 %f46, %f21; mov.f32 %f75, %f46; 
@%p61 bra BB19_67; BB19_65: add.s32 %r143, %r143, 1; setp.lt.s32 %p62, %r143, %r63; @%p62 bra BB19_63; BB19_66: ld.volatile.shared.f32 %f75, [%rd3+32]; st.volatile.shared.f32 [%rd3], %f75; ld.volatile.shared.u32 %r115, [%rd4+32]; st.volatile.shared.u32 [%rd4], %r115; BB19_67: mov.f32 %f23, %f75; setp.lt.s32 %p63, %r62, 8; mov.f32 %f74, %f23; @%p63 bra BB19_74; ld.volatile.shared.f32 %f33, [%rd3+16]; setp.geu.f32 %p64, %f23, %f33; mov.f32 %f42, %f23; mov.f32 %f74, %f42; @%p64 bra BB19_74; ld.volatile.shared.u32 %r117, [%rd4+16]; div.s32 %r47, %r117, %r60; rem.s32 %r48, %r117, %r60; mov.u32 %r144, 0; setp.lt.s32 %p65, %r63, 1; @%p65 bra BB19_73; BB19_70: cvt.s64.s32 %rd11, %r144; mul.wide.s32 %rd84, %r144, 4; mov.u64 %rd85, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd86, %rd85, %rd84; ld.volatile.shared.u32 %r118, [%rd86]; setp.ne.s32 %p66, %r118, %r47; @%p66 bra BB19_72; shl.b64 %rd87, %rd11, 2; mov.u64 %rd88, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd89, %rd88, %rd87; ld.volatile.shared.u32 %r119, [%rd89]; setp.eq.s32 %p67, %r119, %r48; mov.f32 %f43, %f23; mov.f32 %f74, %f43; @%p67 bra BB19_74; BB19_72: add.s32 %r144, %r144, 1; setp.lt.s32 %p68, %r144, %r63; @%p68 bra BB19_70; BB19_73: ld.volatile.shared.f32 %f74, [%rd3+16]; st.volatile.shared.f32 [%rd3], %f74; ld.volatile.shared.u32 %r120, [%rd4+16]; st.volatile.shared.u32 [%rd4], %r120; BB19_74: mov.f32 %f25, %f74; setp.lt.s32 %p69, %r62, 4; mov.f32 %f73, %f25; @%p69 bra BB19_81; ld.volatile.shared.f32 %f34, [%rd3+8]; setp.geu.f32 %p70, %f25, %f34; mov.f32 %f39, %f25; mov.f32 %f73, %f39; @%p70 bra BB19_81; ld.volatile.shared.u32 %r122, [%rd4+8]; div.s32 %r51, %r122, %r60; rem.s32 %r52, %r122, %r60; mov.u32 %r145, 0; setp.lt.s32 %p71, %r63, 1; @%p71 bra BB19_80; BB19_77: cvt.s64.s32 %rd12, %r145; mul.wide.s32 %rd90, %r145, 4; mov.u64 %rd91, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd92, %rd91, 
%rd90; ld.volatile.shared.u32 %r123, [%rd92]; setp.ne.s32 %p72, %r123, %r51; @%p72 bra BB19_79; shl.b64 %rd93, %rd12, 2; mov.u64 %rd94, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd95, %rd94, %rd93; ld.volatile.shared.u32 %r124, [%rd95]; setp.eq.s32 %p73, %r124, %r52; mov.f32 %f40, %f25; mov.f32 %f73, %f40; @%p73 bra BB19_81; BB19_79: add.s32 %r145, %r145, 1; setp.lt.s32 %p74, %r145, %r63; @%p74 bra BB19_77; BB19_80: ld.volatile.shared.f32 %f73, [%rd3+8]; st.volatile.shared.f32 [%rd3], %f73; ld.volatile.shared.u32 %r125, [%rd4+8]; st.volatile.shared.u32 [%rd4], %r125; BB19_81: setp.lt.s32 %p75, %r62, 2; @%p75 bra BB19_88; ld.volatile.shared.f32 %f35, [%rd3+4]; setp.geu.f32 %p76, %f73, %f35; @%p76 bra BB19_88; ld.volatile.shared.u32 %r127, [%rd4+4]; div.s32 %r55, %r127, %r60; rem.s32 %r56, %r127, %r60; mov.u32 %r146, 0; setp.lt.s32 %p77, %r63, 1; @%p77 bra BB19_87; BB19_84: cvt.s64.s32 %rd13, %r146; mul.wide.s32 %rd96, %r146, 4; mov.u64 %rd97, reduce_max_filter_mainf$__cuda_local_var_45636_30_non_const_smaxesRow; add.s64 %rd98, %rd97, %rd96; ld.volatile.shared.u32 %r128, [%rd98]; setp.ne.s32 %p78, %r128, %r55; @%p78 bra BB19_86; shl.b64 %rd99, %rd13, 2; mov.u64 %rd100, reduce_max_filter_mainf$__cuda_local_var_45637_30_non_const_smaxesCol; add.s64 %rd101, %rd100, %rd99; ld.volatile.shared.u32 %r129, [%rd101]; setp.eq.s32 %p79, %r129, %r56; @%p79 bra BB19_88; BB19_86: add.s32 %r146, %r146, 1; setp.lt.s32 %p80, %r146, %r63; @%p80 bra BB19_84; BB19_87: ld.volatile.shared.f32 %f36, [%rd3+4]; st.volatile.shared.f32 [%rd3], %f36; ld.volatile.shared.u32 %r130, [%rd4+4]; st.volatile.shared.u32 [%rd4], %r130; BB19_88: bar.sync 0; BB19_89: setp.ne.s32 %p81, %r1, 0; @%p81 bra BB19_92; ld.shared.f32 %f37, [reduce_max_filter_mainf$__cuda_local_var_45639_32_non_const_sdata]; cvta.to.global.u64 %rd102, %rd15; mul.wide.u32 %rd103, %r68, 4; add.s64 %rd104, %rd102, %rd103; st.global.f32 [%rd104], %f37; ld.shared.u32 %r59, 
[reduce_max_filter_mainf$__cuda_local_var_45640_30_non_const_idxData]; cvta.to.global.u64 %rd105, %rd16; add.s64 %rd106, %rd105, %rd103; st.global.u32 [%rd106], %r59; mov.u32 %r133, %nctaid.x; setp.ne.s32 %p82, %r133, 1; @%p82 bra BB19_92; cvta.to.global.u64 %rd107, %rd17; mul.wide.s32 %rd108, %r63, 4; add.s64 %rd109, %rd107, %rd108; st.global.u32 [%rd109], %r59; BB19_92: ret; }