在本文中,我们将详细介绍 SSE2 Intrinsics 的各个函数,并解答与 ss() 函数相关的问题;同时,我们也将为您带来 android.graphics.Paint.FontMetricsInt 的实例源码、《ARM Neon Intrinsics 学习指北:从入门、进阶到学个通透》、C 常用函数介绍,以及 cocoa 中 [NSTextField intrinsicContentSize] 始终具有未定义宽度的问题。
- SSE2 Intrinsics各函数介绍(ss()函数)
- android.graphics.Paint.FontMetricsInt的实例源码
- ARM Neon Intrinsics 学习指北:从入门、进阶到学个通透
- C 常用函数介绍
- cocoa – [NSTextField intrinsicContentSize]始终具有未定义的宽度
SSE2 Intrinsics各函数介绍(ss()函数)
转自:https://blog.csdn.net/fengbingchun/article/details/18460199
SIMD相关头文件包括:
//#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#include <dvec.h>//SSE2(also include fvec.h) #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h>//SSSE3(include pmmintrin.h) #include <smmintrin.h>//SSE4.1(include tmmintrin.h) #include <nmmintrin.h>//SSE4.2(include smmintrin.h) #include <wmmintrin.h>//AES(include nmmintrin.h) #include <immintrin.h>//AVX(include wmmintrin.h) #include <intrin.h>//(include immintrin.h)
mmintrin.h为MMX 头文件,其中__m64的定义为:
/* __m64: 8-byte-aligned (_CRT_ALIGN(8)) MSVC intrinsic union for one 64-bit MMX value.
 * All members overlay the same 8 bytes, giving alternative views of the value:
 * one unsigned/signed 64-bit integer (m64_u64 / m64_i64), two floats (m64_f32),
 * and signed or unsigned arrays of 8 x 8-bit, 4 x 16-bit or 2 x 32-bit integers. */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64 { unsigned __int64 m64_u64; float m64_f32[2]; __int8 m64_i8[8]; __int16 m64_i16[4]; __int32 m64_i32[2]; __int64 m64_i64; unsigned __int8 m64_u8[8]; unsigned __int16 m64_u16[4]; unsigned __int32 m64_u32[2]; } __m64;
xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:
/* __m128: 16-byte-aligned (_CRT_ALIGN(16)) MSVC intrinsic union for one 128-bit SSE value.
 * The first member, m128_f32, views it as four floats; the remaining members overlay
 * the same 16 bytes as signed or unsigned arrays of 16 x 8-bit, 8 x 16-bit,
 * 4 x 32-bit or 2 x 64-bit integers. */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128 { float m128_f32[4]; unsigned __int64 m128_u64[2]; __int8 m128_i8[16]; __int16 m128_i16[8]; __int32 m128_i32[4]; __int64 m128_i64[2]; unsigned __int8 m128_u8[16]; unsigned __int16 m128_u16[8]; unsigned __int32 m128_u32[4]; } __m128;
emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:
/* __m128i: 16-byte-aligned MSVC intrinsic union for one 128-bit integer vector, viewable as
 * signed or unsigned arrays of 16 x 8-bit, 8 x 16-bit, 4 x 32-bit or 2 x 64-bit integers.
 * __m128d: 16-byte-aligned MSVC intrinsic struct holding two doubles (m128d_f64[2]). */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i { __int8 m128i_i8[16]; __int16 m128i_i16[8]; __int32 m128i_i32[4]; __int64 m128i_i64[2]; unsigned __int8 m128i_u8[16]; unsigned __int16 m128i_u16[8]; unsigned __int32 m128i_u32[4]; unsigned __int64 m128i_u64[2]; } __m128i; typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d { double m128d_f64[2]; } __m128d;
emmintrin.h文件中各函数的介绍:
/*
 * SSE2 intrinsics declared in emmintrin.h (MSVC-style prototypes).
 *
 * Notation used in the comments below: rN is element N of the returned value,
 * _AN / _BN is element N of the corresponding argument, and element 0 is the
 * least-significant one. "_sd" intrinsics operate on the low double only and
 * pass the high double of _A through; "_pd" intrinsics operate on both doubles.
 */

/*----Floating-Point Intrinsics Using Streaming SIMD Extension 2 Instructions----*/

// Arithmetic operations (floating point): add, sub, mul, div, sqrt, min, max

// r0 = _A0 + _B0, r1 = _A1
extern __m128d _mm_add_sd(__m128d _A, __m128d _B);
// r0 = _A0 + _B0, r1 = _A1 + _B1
extern __m128d _mm_add_pd(__m128d _A, __m128d _B);
// r0 = _A0 - _B0, r1 = _A1
extern __m128d _mm_sub_sd(__m128d _A, __m128d _B);
// r0 = _A0 - _B0, r1 = _A1 - _B1
extern __m128d _mm_sub_pd(__m128d _A, __m128d _B);
// r0 = _A0 * _B0, r1 = _A1
extern __m128d _mm_mul_sd(__m128d _A, __m128d _B);
// r0 = _A0 * _B0, r1 = _A1 * _B1
extern __m128d _mm_mul_pd(__m128d _A, __m128d _B);
// r0 = sqrt(_B0), r1 = _A1
extern __m128d _mm_sqrt_sd(__m128d _A, __m128d _B);
// r0 = sqrt(_A0), r1 = sqrt(_A1)
extern __m128d _mm_sqrt_pd(__m128d _A);
// r0 = _A0 / _B0, r1 = _A1
extern __m128d _mm_div_sd(__m128d _A, __m128d _B);
// r0 = _A0 / _B0, r1 = _A1 / _B1
extern __m128d _mm_div_pd(__m128d _A, __m128d _B);
// r0 = min(_A0, _B0), r1 = _A1
extern __m128d _mm_min_sd(__m128d _A, __m128d _B);
// r0 = min(_A0, _B0), r1 = min(_A1, _B1)
extern __m128d _mm_min_pd(__m128d _A, __m128d _B);
// r0 = max(_A0, _B0), r1 = _A1
extern __m128d _mm_max_sd(__m128d _A, __m128d _B);
// r0 = max(_A0, _B0), r1 = max(_A1, _B1)
extern __m128d _mm_max_pd(__m128d _A, __m128d _B);

// Logical operations (floating point): and, andnot, or, xor (bitwise on the raw bits)

// r0 = _A0 & _B0, r1 = _A1 & _B1
extern __m128d _mm_and_pd(__m128d _A, __m128d _B);
// r0 = (~_A0) & _B0, r1 = (~_A1) & _B1
extern __m128d _mm_andnot_pd(__m128d _A, __m128d _B);
// r0 = _A0 | _B0, r1 = _A1 | _B1
extern __m128d _mm_or_pd(__m128d _A, __m128d _B);
// r0 = _A0 ^ _B0, r1 = _A1 ^ _B1
extern __m128d _mm_xor_pd(__m128d _A, __m128d _B);

// Comparisons: ==, <, <=, >, >=, != and their negations.
// Each compared element becomes an all-ones 64-bit mask when the predicate
// holds and zero otherwise.

// r0 = (_A0 == _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpeq_sd(__m128d _A, __m128d _B);
// r0 = (_A0 == _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 == _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpeq_pd(__m128d _A, __m128d _B);
// r0 = (_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmplt_sd(__m128d _A, __m128d _B);
// r0 = (_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 < _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmplt_pd(__m128d _A, __m128d _B);
// r0 = (_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmple_sd(__m128d _A, __m128d _B);
// r0 = (_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 <= _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmple_pd(__m128d _A, __m128d _B);
// r0 = (_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpgt_sd(__m128d _A, __m128d _B);
// r0 = (_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 > _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpgt_pd(__m128d _A, __m128d _B);
// r0 = (_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpge_sd(__m128d _A, __m128d _B);
// r0 = (_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 >= _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpge_pd(__m128d _A, __m128d _B);
// r0 = (_A0 != _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpneq_sd(__m128d _A, __m128d _B);
// r0 = (_A0 != _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 != _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpneq_pd(__m128d _A, __m128d _B);
// r0 = !(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpnlt_sd(__m128d _A, __m128d _B);
// r0 = !(_A0 < _B0) ? 0xffffffffffffffff : 0x0, r1 = !(_A1 < _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpnlt_pd(__m128d _A, __m128d _B);
// r0 = !(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpnle_sd(__m128d _A, __m128d _B);
// r0 = !(_A0 <= _B0) ? 0xffffffffffffffff : 0x0, r1 = !(_A1 <= _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpnle_pd(__m128d _A, __m128d _B);
// r0 = !(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpngt_sd(__m128d _A, __m128d _B);
// r0 = !(_A0 > _B0) ? 0xffffffffffffffff : 0x0, r1 = !(_A1 > _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpngt_pd(__m128d _A, __m128d _B);
// r0 = !(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpnge_sd(__m128d _A, __m128d _B);
// r0 = !(_A0 >= _B0) ? 0xffffffffffffffff : 0x0, r1 = !(_A1 >= _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpnge_pd(__m128d _A, __m128d _B);
// "ord" holds when neither operand is NaN.
// r0 = (_A0 ord _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 ord _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpord_pd(__m128d _A, __m128d _B);
// r0 = (_A0 ord _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpord_sd(__m128d _A, __m128d _B);
// "unord" holds when at least one operand is NaN.
// r0 = (_A0 unord _B0) ? 0xffffffffffffffff : 0x0, r1 = (_A1 unord _B1) ? 0xffffffffffffffff : 0x0
extern __m128d _mm_cmpunord_pd(__m128d _A, __m128d _B);
// r0 = (_A0 unord _B0) ? 0xffffffffffffffff : 0x0, r1 = _A1
extern __m128d _mm_cmpunord_sd(__m128d _A, __m128d _B);

// Scalar comparisons returning an int (0 or 1).
// NOTE(review): the NaN results below are the ones stated by the original,
// MSVC-oriented listing; other compilers may differ, so verify before relying on them.

// r = (_A0 == _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_comieq_sd(__m128d _A, __m128d _B);
// r = (_A0 < _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_comilt_sd(__m128d _A, __m128d _B);
// r = (_A0 <= _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_comile_sd(__m128d _A, __m128d _B);
// r = (_A0 > _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_comigt_sd(__m128d _A, __m128d _B);
// r = (_A0 >= _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_comige_sd(__m128d _A, __m128d _B);
// r = (_A0 != _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_comineq_sd(__m128d _A, __m128d _B);
// r = (_A0 == _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_ucomieq_sd(__m128d _A, __m128d _B);
// r = (_A0 < _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_ucomilt_sd(__m128d _A, __m128d _B);
// r = (_A0 <= _B0) ? 0x1 : 0x0; if _A or _B is NaN, 1 is returned
extern int _mm_ucomile_sd(__m128d _A, __m128d _B);
// r = (_A0 > _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_ucomigt_sd(__m128d _A, __m128d _B);
// r = (_A0 >= _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_ucomige_sd(__m128d _A, __m128d _B);
// r = (_A0 != _B0) ? 0x1 : 0x0; if _A or _B is NaN, 0 is returned
extern int _mm_ucomineq_sd(__m128d _A, __m128d _B);

// Conversion operations

// r0 = (double)_A0, r1 = (double)_A1 (low two 32-bit integers of _A)
extern __m128d _mm_cvtepi32_pd(__m128i _A);
// r0 = (int)_A0, r1 = (int)_A1, r2 = 0x0, r3 = 0x0
extern __m128i _mm_cvtpd_epi32(__m128d _A);
// r0 = (int)_A0, r1 = (int)_A1, r2 = 0x0, r3 = 0x0, using truncation
extern __m128i _mm_cvttpd_epi32(__m128d _A);
// r0 = (float)_A0, r1 = (float)_A1, r2 = (float)_A2, r3 = (float)_A3
extern __m128 _mm_cvtepi32_ps(__m128i _A);
// r0 = (int)_A0, r1 = (int)_A1, r2 = (int)_A2, r3 = (int)_A3
extern __m128i _mm_cvtps_epi32(__m128 _A);
// r0 = (int)_A0, r1 = (int)_A1, r2 = (int)_A2, r3 = (int)_A3, using truncation
extern __m128i _mm_cvttps_epi32(__m128 _A);
// r0 = (float)_A0, r1 = (float)_A1, r2 = 0.0, r3 = 0.0
extern __m128 _mm_cvtpd_ps(__m128d _A);
// r0 = (double)_A0, r1 = (double)_A1 (low two floats of _A)
extern __m128d _mm_cvtps_pd(__m128 _A);
// r0 = (float)_B0, r1 = _A1, r2 = _A2, r3 = _A3
extern __m128 _mm_cvtsd_ss(__m128 _A, __m128d _B);
// r0 = (double)_B0, r1 = _A1
extern __m128d _mm_cvtss_sd(__m128d _A, __m128 _B);
// Returns a 32-bit integer, r = (int)_A0
extern int _mm_cvtsd_si32(__m128d _A);
// Returns a 32-bit integer, r = (int)_A0, using truncation
extern int _mm_cvttsd_si32(__m128d _A);
// r0 = (double)_B, r1 = _A1
extern __m128d _mm_cvtsi32_sd(__m128d _A, int _B);
// Returns an __m64, r0 = (int)_A0, r1 = (int)_A1
extern __m64 _mm_cvtpd_pi32(__m128d _A);
// Returns an __m64, r0 = (int)_A0, r1 = (int)_A1, using truncation
extern __m64 _mm_cvttpd_pi32(__m128d _A);
// r0 = (double)_A0, r1 = (double)_A1
extern __m128d _mm_cvtpi32_pd(__m64 _A);

// Miscellaneous operations (floating-point SSE2 intrinsics)

// r0 = _A1, r1 = _B1
extern __m128d _mm_unpackhi_pd(__m128d _A, __m128d _B);
// r0 = _A0, r1 = _B0
extern __m128d _mm_unpacklo_pd(__m128d _A, __m128d _B);
// Returns a 2-bit mask, r = sign(_A1) << 1 | sign(_A0)
extern int _mm_movemask_pd(__m128d _A);
// Selects one double from _A and one from _B according to the mask _I:
// r0 = (_I & 1) ? _A1 : _A0, r1 = (_I & 2) ? _B1 : _B0. _I must be an immediate.
extern __m128d _mm_shuffle_pd(__m128d _A, __m128d _B, int _I);

// Load operations (floating-point SSE2 intrinsics)

// r0 = _Dp[0], r1 = _Dp[1]. The address _Dp must be 16-byte aligned.
extern __m128d _mm_load_pd(double const*_Dp);
// r0 = *_Dp, r1 = *_Dp. The address _Dp does not need to be 16-byte aligned.
extern __m128d _mm_load1_pd(double const*_Dp);
// r0 = _Dp[1], r1 = _Dp[0]. The address _Dp must be 16-byte aligned.
extern __m128d _mm_loadr_pd(double const*_Dp);
// r0 = _Dp[0], r1 = _Dp[1]. The address _Dp does not need to be 16-byte aligned.
extern __m128d _mm_loadu_pd(double const*_Dp);
// r0 = *_Dp, r1 = 0.0. The address _Dp does not need to be 16-byte aligned.
extern __m128d _mm_load_sd(double const*_Dp);
// r0 = _A0, r1 = *_Dp. The address _Dp does not need to be 16-byte aligned.
extern __m128d _mm_loadh_pd(__m128d _A, double const*_Dp);
// r0 = *_Dp, r1 = _A1. The address _Dp does not need to be 16-byte aligned.
extern __m128d _mm_loadl_pd(__m128d _A, double const*_Dp);

// Set operations (floating-point SSE2 intrinsics)

// r0 = _W, r1 = 0.0
extern __m128d _mm_set_sd(double _W);
// r0 = _A, r1 = _A
extern __m128d _mm_set1_pd(double _A);
// r0 = _Y, r1 = _Z (arguments are given high element first)
extern __m128d _mm_set_pd(double _Z, double _Y);
// r0 = _Y, r1 = _Z (arguments are given low element first)
extern __m128d _mm_setr_pd(double _Y, double _Z);
// r0 = 0.0, r1 = 0.0
extern __m128d _mm_setzero_pd(void);
// r0 = _B0, r1 = _A1
extern __m128d _mm_move_sd(__m128d _A, __m128d _B);

// Store operations (floating-point SSE2 intrinsics)

// *_Dp = _A0. The address _Dp does not need to be 16-byte aligned.
extern void _mm_store_sd(double *_Dp, __m128d _A);
// _Dp[0] = _A0, _Dp[1] = _A0. The address _Dp must be 16-byte aligned.
extern void _mm_store1_pd(double *_Dp, __m128d _A);
// _Dp[0] = _A0, _Dp[1] = _A1. The address _Dp must be 16-byte aligned.
extern void _mm_store_pd(double *_Dp, __m128d _A);
// _Dp[0] = _A0, _Dp[1] = _A1. The address _Dp does not need to be 16-byte aligned.
extern void _mm_storeu_pd(double *_Dp, __m128d _A);
// _Dp[0] = _A1, _Dp[1] = _A0. The address _Dp must be 16-byte aligned.
extern void _mm_storer_pd(double *_Dp, __m128d _A);
// *_Dp = _A1
extern void _mm_storeh_pd(double *_Dp, __m128d _A);
// *_Dp = _A0
extern void _mm_storel_pd(double *_Dp, __m128d _A);

// New convert to float
// Returns a 64-bit double, r = _A0 (extracts the lower-order floating-point value)
extern double _mm_cvtsd_f64(__m128d _A);

// Cache support for Streaming SIMD Extensions 2 floating-point operations

// _Dp[0] = _A0, _Dp[1] = _A1. Stores the data in _A to the address _Dp without
// polluting the caches. The address _Dp must be 16-byte aligned. If the cache
// line containing address _Dp is already in the cache, the cache will be updated.
extern void _mm_stream_pd(double *_Dp, __m128d _A);

/*------------Integer Intrinsics Using Streaming SIMD Extensions 2-------------*/

// Arithmetic operations (integer SSE2 intrinsics): add, sub, mul, avg, min, max

// 16 x 8-bit: r0 = _A0 + _B0, r1 = _A1 + _B1, ... r15 = _A15 + _B15
extern __m128i _mm_add_epi8(__m128i _A, __m128i _B);
// 8 x 16-bit (signed or unsigned): ri = _Ai + _Bi, i.e. r0 = _A0 + _B0, ... r7 = _A7 + _B7
extern __m128i _mm_add_epi16(__m128i _A, __m128i _B);
// 4 x 32-bit: r0 = _A0 + _B0, r1 = _A1 + _B1, r2 = _A2 + _B2, r3 = _A3 + _B3
extern __m128i _mm_add_epi32(__m128i _A, __m128i _B);
// Returns an __m64, r = _A + _B
extern __m64 _mm_add_si64(__m64 _A, __m64 _B);
// 2 x 64-bit: r0 = _A0 + _B0, r1 = _A1 + _B1
extern __m128i _mm_add_epi64(__m128i _A, __m128i _B);
// 16 x signed 8-bit with saturation: r0 = SignedSaturate(_A0 + _B0), ...
// r15 = SignedSaturate(_A15 + _B15)
extern __m128i _mm_adds_epi8(__m128i _A, __m128i _B);
// 8 x signed 16-bit: r0 = SignedSaturate(_A0 + _B0), ... r7 = SignedSaturate(_A7 + _B7);
// results that overflow are clamped to the boundary value (saturation)
extern __m128i _mm_adds_epi16(__m128i _A, __m128i _B);
// 16 x unsigned 8-bit with saturation: r0 = UnsignedSaturate(_A0 + _B0), ...
// r15 = UnsignedSaturate(_A15 + _B15)
extern __m128i _mm_adds_epu8(__m128i _A, __m128i _B);
// 8 x unsigned 16-bit with saturation: r0 = UnsignedSaturate(_A0 + _B0), ...
// r7 = UnsignedSaturate(_A7 + _B7)
extern __m128i _mm_adds_epu16(__m128i _A, __m128i _B);
// 16 x unsigned 8-bit average with rounding: r0 = (_A0 + _B0) / 2, ... r15 = (_A15 + _B15) / 2
extern __m128i _mm_avg_epu8(__m128i _A, __m128i _B);
// 8 x unsigned 16-bit average with rounding: ri = (_Ai + _Bi) / 2,
// i.e. r0 = (_A0 + _B0) / 2, ... r7 = (_A7 + _B7) / 2
extern __m128i _mm_avg_epu16(__m128i _A, __m128i _B);
// Multiplies the 8 signed 16-bit elements pairwise and adds adjacent products,
// producing 4 signed 32-bit results: r0 = (_A0 * _B0) + (_A1 * _B1),
// r1 = (_A2 * _B2) + (_A3 * _B3), r2 = (_A4 * _B4) + (_A5 * _B5), r3 = (_A6 * _B6) + (_A7 * _B7)
extern __m128i _mm_madd_epi16(__m128i _A, __m128i _B);
// 8 x signed 16-bit maximum: ri = max(_Ai, _Bi), i.e. r0 = max(_A0, _B0), ... r7 = max(_A7, _B7)
extern __m128i _mm_max_epi16(__m128i _A, __m128i _B);
// 16 x unsigned 8-bit maximum: r0 = max(_A0, _B0), ... r15 = max(_A15, _B15)
extern __m128i _mm_max_epu8(__m128i _A, __m128i _B);
// 8 x signed 16-bit minimum: ri = min(_Ai, _Bi), i.e. r0 = min(_A0, _B0), ... r7 = min(_A7, _B7)
extern __m128i _mm_min_epi16(__m128i _A, __m128i _B);
// 16 x unsigned 8-bit minimum: r0 = min(_A0, _B0), ... r15 = min(_A15, _B15)
extern __m128i _mm_min_epu8(__m128i _A, __m128i _B);
// 8 x signed 16-bit multiply keeping the high 16 bits of each 32-bit product:
// ri = (_Ai * _Bi)[31:16], i.e. r0 = (_A0 * _B0)[31:16], ... r7 = (_A7 * _B7)[31:16]
extern __m128i _mm_mulhi_epi16(__m128i _A, __m128i _B);
// 8 x unsigned 16-bit multiply keeping the high 16 bits:
// r0 = (_A0 * _B0)[31:16], r1 = (_A1 * _B1)[31:16], ... r7 = (_A7 * _B7)[31:16]
extern __m128i _mm_mulhi_epu16(__m128i _A, __m128i _B);
// 8 x 16-bit multiply (signed or unsigned) keeping the low 16 bits of each product:
// ri = (_Ai * _Bi)[15:0], i.e. r0 = (_A0 * _B0)[15:0], ... r7 = (_A7 * _B7)[15:0]
extern __m128i _mm_mullo_epi16(__m128i _A, __m128i _B);
// Returns an __m64, r = _A0 * _B0 (low unsigned 32-bit elements, 64-bit product)
extern __m64 _mm_mul_su32(__m64 _A, __m64 _B);
// Unsigned 32-bit x 32-bit -> 64-bit: r0 = _A0 * _B0, r1 = _A2 * _B2
extern __m128i _mm_mul_epu32(__m128i _A, __m128i _B);
// Sum of absolute differences of unsigned 8-bit elements, results in 16-bit lanes:
// r0 = abs(_A0 - _B0) + abs(_A1 - _B1) + ... + abs(_A7 - _B7), r1 = 0x0, r2 = 0x0, r3 = 0x0,
// r4 = abs(_A8 - _B8) + abs(_A9 - _B9) + ... + abs(_A15 - _B15), r5 = 0x0, r6 = 0x0, r7 = 0x0
extern __m128i _mm_sad_epu8(__m128i _A, __m128i _B);
// 16 x 8-bit: r0 = _A0 - _B0, r1 = _A1 - _B1, ... r15 = _A15 - _B15
extern __m128i _mm_sub_epi8(__m128i _A, __m128i _B);
// 8 x 16-bit (signed or unsigned): ri = _Ai - _Bi, i.e. r0 = _A0 - _B0, ... r7 = _A7 - _B7
extern __m128i _mm_sub_epi16(__m128i _A, __m128i _B);
// 4 x 32-bit: r0 = _A0 - _B0, r1 = _A1 - _B1, r2 = _A2 - _B2, r3 = _A3 - _B3
extern __m128i _mm_sub_epi32(__m128i _A, __m128i _B);
// Returns an __m64, r = _A - _B
extern __m64 _mm_sub_si64(__m64 _A, __m64 _B);
// 2 x 64-bit: r0 = _A0 - _B0, r1 = _A1 - _B1
extern __m128i _mm_sub_epi64(__m128i _A, __m128i _B);
// 16 x signed 8-bit with saturation: r0 = SignedSaturate(_A0 - _B0), ...
// r15 = SignedSaturate(_A15 - _B15)
extern __m128i _mm_subs_epi8(__m128i _A, __m128i _B);
// 8 x signed 16-bit with saturation (overflow clamps to the boundary value):
// r0 = SignedSaturate(_A0 - _B0), r1 = SignedSaturate(_A1 - _B1), ... r7 = SignedSaturate(_A7 - _B7)
extern __m128i _mm_subs_epi16(__m128i _A, __m128i _B);
// 16 x unsigned 8-bit with saturation: r0 = UnsignedSaturate(_A0 - _B0), ...
// r15 = UnsignedSaturate(_A15 - _B15)
extern __m128i _mm_subs_epu8(__m128i _A, __m128i _B);
// 8 x unsigned 16-bit with saturation: r0 = UnsignedSaturate(_A0 - _B0), ...
// r7 = UnsignedSaturate(_A7 - _B7)
extern __m128i _mm_subs_epu16(__m128i _A, __m128i _B);

// Logical operations (integer SSE2 intrinsics): and, andnot, or, xor

// Bitwise AND of the two 128-bit values, r = _A & _B
extern __m128i _mm_and_si128(__m128i _A, __m128i _B);
// Inverts every bit of _A, then ANDs with _B, r = (~_A) & _B
extern __m128i _mm_andnot_si128(__m128i _A, __m128i _B);
// Bitwise OR of the two 128-bit values, r = _A | _B
extern __m128i _mm_or_si128(__m128i _A, __m128i _B);
// Bitwise XOR of the two 128-bit values, r = _A ^ _B
extern __m128i _mm_xor_si128(__m128i _A, __m128i _B);

// Shift operations.
// In the "_sll/_sra/_srl" forms every element is shifted by the same amount,
// which is taken from the low 64 bits of the _Count register.

// Whole-register byte shift left, r = _A << (_Imm * 8), shifting in zeros.
// _Imm must be an immediate.
extern __m128i _mm_slli_si128(__m128i _A, int _Imm);
// 8 x 16-bit logical left shift by _Count: r0 = _A0 << _Count, r1 = _A1 << _Count, ...
// r7 = _A7 << _Count, shifting in zeros
extern __m128i _mm_slli_epi16(__m128i _A, int _Count);
// 8 x 16-bit logical left shift by the count held in _Count: r0 = _A0 << _Count, ...
// r7 = _A7 << _Count, shifting in zeros
extern __m128i _mm_sll_epi16(__m128i _A, __m128i _Count);
// 4 x 32-bit: r0 = _A0 << _Count, r1 = _A1 << _Count, r2 = _A2 << _Count,
// r3 = _A3 << _Count, shifting in zeros
extern __m128i _mm_slli_epi32(__m128i _A, int _Count);
// 4 x 32-bit, count held in _Count: r0 = _A0 << _Count, ... r3 = _A3 << _Count, shifting in zeros
extern __m128i _mm_sll_epi32(__m128i _A, __m128i _Count);
// 2 x 64-bit: r0 = _A0 << _Count, r1 = _A1 << _Count, shifting in zeros
extern __m128i _mm_slli_epi64(__m128i _A, int _Count);
// 2 x 64-bit, count held in _Count: r0 = _A0 << _Count, r1 = _A1 << _Count, shifting in zeros
extern __m128i _mm_sll_epi64(__m128i _A, __m128i _Count);
// 8 x 16-bit arithmetic right shift by _Count: r0 = _A0 >> _Count, r1 = _A1 >> _Count, ...
// r7 = _A7 >> _Count, shifting in the sign bit
extern __m128i _mm_srai_epi16(__m128i _A, int _Count);
// 8 x 16-bit arithmetic right shift by the count held in _Count: r0 = _A0 >> _Count, ...
// r7 = _A7 >> _Count, shifting in the sign bit
extern __m128i _mm_sra_epi16(__m128i _A, __m128i _Count);
// 4 x 32-bit: r0 = _A0 >> _Count, r1 = _A1 >> _Count, r2 = _A2 >> _Count,
// r3 = _A3 >> _Count, shifting in the sign bit
extern __m128i _mm_srai_epi32(__m128i _A, int _Count);
// 4 x 32-bit, count held in _Count: r0 = _A0 >> _Count, ... r3 = _A3 >> _Count,
// shifting in the sign bit
extern __m128i _mm_sra_epi32(__m128i _A, __m128i _Count);
// Whole-register byte shift right, r = srl(_A, _Imm * 8), shifting in zeros.
// _Imm must be an immediate.
extern __m128i _mm_srli_si128(__m128i _A, int _Imm);
// 8 x 16-bit logical right shift by _Count (zero fill): r0 = srl(_A0, _Count),
// r1 = srl(_A1, _Count), ... r7 = srl(_A7, _Count)
extern __m128i _mm_srli_epi16(__m128i _A, int _Count);
// 8 x 16-bit logical right shift by the count held in _Count (zero fill):
// r0 = srl(_A0, _Count), r1 = srl(_A1, _Count), ... r7 = srl(_A7, _Count)
extern __m128i _mm_srl_epi16(__m128i _A, __m128i _Count);
// 4 x 32-bit: r0 = srl(_A0, _Count), r1 = srl(_A1, _Count), r2 = srl(_A2, _Count),
// r3 = srl(_A3, _Count), shifting in zeros
extern __m128i _mm_srli_epi32(__m128i _A, int _Count);
// 4 x 32-bit, count held in _Count: r0 = srl(_A0, _Count), ... r3 = srl(_A3, _Count),
// shifting in zeros
extern __m128i _mm_srl_epi32(__m128i _A, __m128i _Count);
// 2 x 64-bit: r0 = srl(_A0, _Count), r1 = srl(_A1, _Count), shifting in zeros
extern __m128i _mm_srli_epi64(__m128i _A, int _Count);
// 2 x 64-bit, count held in _Count: r0 = srl(_A0, _Count), r1 = srl(_A1, _Count),
// shifting in zeros
extern __m128i _mm_srl_epi64(__m128i _A, __m128i _Count);

// Comparison intrinsics (SSE2): ==, >, <
// Each element becomes all ones when the predicate holds and zero otherwise.

// 16 x 8-bit: r0 = (_A0 == _B0) ? 0xff : 0x0, r1 = (_A1 == _B1) ? 0xff : 0x0, ...
// r15 = (_A15 == _B15) ? 0xff : 0x0
extern __m128i _mm_cmpeq_epi8(__m128i _A, __m128i _B);
// 8 x 16-bit: ri = (_Ai == _Bi) ? 0xffff : 0x0, i.e. r0 = (_A0 == _B0) ? 0xffff : 0x0, ...
// r7 = (_A7 == _B7) ? 0xffff : 0x0
extern __m128i _mm_cmpeq_epi16(__m128i _A, __m128i _B);
// 4 x 32-bit: r0 = (_A0 == _B0) ? 0xffffffff : 0x0, r1 = (_A1 == _B1) ? 0xffffffff : 0x0,
// r2 = (_A2 == _B2) ? 0xffffffff : 0x0, r3 = (_A3 == _B3) ? 0xffffffff : 0x0
extern __m128i _mm_cmpeq_epi32(__m128i _A, __m128i _B);
// 16 x signed 8-bit: r0 = (_A0 > _B0) ? 0xff : 0x0, r1 = (_A1 > _B1) ? 0xff : 0x0, ...
// r15 = (_A15 > _B15) ? 0xff : 0x0
extern __m128i _mm_cmpgt_epi8(__m128i _A, __m128i _B);
// 8 x signed 16-bit: ri = (_Ai > _Bi) ? 0xffff : 0x0, i.e. r0 = (_A0 > _B0) ? 0xffff : 0x0, ...
// r7 = (_A7 > _B7) ? 0xffff : 0x0
extern __m128i _mm_cmpgt_epi16(__m128i _A, __m128i _B);
// 4 x signed 32-bit: r0 = (_A0 > _B0) ? 0xffffffff : 0x0, r1 = (_A1 > _B1) ? 0xffffffff : 0x0,
// r2 = (_A2 > _B2) ? 0xffffffff : 0x0, r3 = (_A3 > _B3) ? 0xffffffff : 0x0
extern __m128i _mm_cmpgt_epi32(__m128i _A, __m128i _B);
// 16 x signed 8-bit: r0 = (_A0 < _B0) ? 0xff : 0x0, r1 = (_A1 < _B1) ? 0xff : 0x0, ...
// r15 = (_A15 < _B15) ? 0xff : 0x0
extern __m128i _mm_cmplt_epi8(__m128i _A, __m128i _B);
// 8 x signed 16-bit: ri = (_Ai < _Bi) ? 0xffff : 0x0, i.e. r0 = (_A0 < _B0) ? 0xffff : 0x0, ...
// r7 = (_A7 < _B7) ? 0xffff : 0x0
extern __m128i _mm_cmplt_epi16(__m128i _A, __m128i _B);
// 4 x signed 32-bit: r0 = (_A0 < _B0) ? 0xffffffff : 0x0, r1 = (_A1 < _B1) ? 0xffffffff : 0x0,
// r2 = (_A2 < _B2) ? 0xffffffff : 0x0, r3 = (_A3 < _B3) ? 0xffffffff : 0x0
extern __m128i _mm_cmplt_epi32(__m128i _A, __m128i _B);

// Conversion intrinsics: int <-----> __m128i

// r0 = _A, r1 = 0x0, r2 = 0x0, r3 = 0x0
extern __m128i _mm_cvtsi32_si128(int _A);
// Returns a 32-bit integer, r = _A0
extern int _mm_cvtsi128_si32(__m128i _A);

// Miscellaneous operations (integer SSE2 intrinsics)

// Packs signed 16-bit elements into signed 8-bit elements with saturation:
// r0 = SignedSaturate(_A0), r1 = SignedSaturate(_A1), ... r7 = SignedSaturate(_A7),
// r8 = SignedSaturate(_B0), r9 = SignedSaturate(_B1), ... r15 = SignedSaturate(_B7)
extern __m128i _mm_packs_epi16(__m128i _A, __m128i _B);
// Packs signed 32-bit elements into signed 16-bit elements with saturation:
// r0 = SignedSaturate(_A0), r1 = SignedSaturate(_A1), r2 = SignedSaturate(_A2),
// r3 = SignedSaturate(_A3), r4 = SignedSaturate(_B0), r5 = SignedSaturate(_B1),
// r6 = SignedSaturate(_B2), r7 = SignedSaturate(_B3)
extern __m128i _mm_packs_epi32(__m128i _A, __m128i _B);
// Packs signed 16-bit elements into unsigned 8-bit elements with saturation:
// r0 = UnsignedSaturate(_A0), r1 = UnsignedSaturate(_A1), ... r7 = UnsignedSaturate(_A7),
// r8 = UnsignedSaturate(_B0), r9 = UnsignedSaturate(_B1), ... r15 = UnsignedSaturate(_B7)
extern __m128i _mm_packus_epi16(__m128i _A, __m128i _B);
// Returns the 16-bit element of _A selected by _Imm, zero-extended:
// r = (_Imm == 0) ? _A0 : ((_Imm == 1) ? _A1 : ... (_Imm == 7) ? _A7).
// _Imm must be an immediate.
extern int _mm_extract_epi16(__m128i _A, int _Imm);
// Returns _A with the 16-bit element selected by _Imm replaced by _B:
// r0 = (_Imm == 0) ? _B : _A0, r1 = (_Imm == 1) ? _B : _A1, ... r7 = (_Imm == 7) ? _B : _A7
extern __m128i _mm_insert_epi16(__m128i _A, int _B, int _Imm);
// Returns a 16-bit mask built from the top bit of every byte:
// r = (_A15[7] << 15) | (_A14[7] << 14) ... (_A1[7] << 1) | _A0[7],
// zero-extending the upper bits
extern int _mm_movemask_epi8(__m128i _A);
// Rearranges the four 32-bit elements of _A. _Imm packs four 2-bit selectors,
// so for i = 0..3: ri = _A[(_Imm >> (2 * i)) & 3]. When _A holds 16-bit data the
// elements therefore move in pairs. _Imm must be an immediate.
extern __m128i _mm_shuffle_epi32(__m128i _A, int _Imm);
// Rearranges the four 16-bit elements in the high 64 bits of _A; the low 64 bits
// are copied unchanged. For i = 0..3: r(4 + i) = _A[4 + ((_Imm >> (2 * i)) & 3)],
// and r0..r3 = _A0.._A3. _Imm must be an immediate.
extern __m128i _mm_shufflehi_epi16(__m128i _A, int _Imm);
// Rearranges the four 16-bit elements in the low 64 bits of _A; the high 64 bits
// are copied unchanged. For i = 0..3: ri = _A[(_Imm >> (2 * i)) & 3],
// and r4..r7 = _A4.._A7. _Imm must be an immediate.
extern __m128i _mm_shufflelo_epi16(__m128i _A, int _Imm);
// Interleaves the high 8 bytes of _A and _B:
// r0 = _A8, r1 = _B8, r2 = _A9, r3 = _B9, ... r14 = _A15, r15 = _B15
extern __m128i _mm_unpackhi_epi8(__m128i _A, __m128i _B);
// Interleaves the 16-bit elements in the high 64 bits of _A and _B:
// r0 = _A4, r1 = _B4, r2 = _A5, r3 = _B5, r4 = _A6, r5 = _B6, r6 = _A7, r7 = _B7
extern __m128i _mm_unpackhi_epi16(__m128i _A, __m128i _B);
// Interleaves the 32-bit elements in the high 64 bits of _A and _B:
// r0 = _A2, r1 = _B2, r2 = _A3, r3 = _B3
extern __m128i _mm_unpackhi_epi32(__m128i _A, __m128i _B);
// Combines the high 64-bit elements of _A and _B: r0 = _A1, r1 = _B1
extern __m128i _mm_unpackhi_epi64(__m128i _A, __m128i _B);
// Interleaves the low 8 bytes of _A and _B:
// r0 = _A0, r1 = _B0, r2 = _A1, r3 = _B1, ... r14 = _A7, r15 = _B7
extern __m128i _mm_unpacklo_epi8(__m128i _A, __m128i _B);
// Interleaves the 16-bit elements in the low 64 bits of _A and _B:
// r0 = _A0, r1 = _B0, r2 = _A1, r3 = _B1, r4 = _A2, r5 = _B2, r6 = _A3, r7 = _B3
extern __m128i _mm_unpacklo_epi16(__m128i _A, __m128i _B);
// Interleaves the 32-bit elements in the low 64 bits of _A and _B:
// r0 = _A0, r1 = _B0, r2 = _A1, r3 = _B1
extern __m128i _mm_unpacklo_epi32(__m128i _A, __m128i _B);
// Combines the low 64-bit elements of _A and _B: r0 = _A0, r1 = _B0
extern __m128i _mm_unpacklo_epi64(__m128i _A, __m128i _B);

// Load operations (integer SSE2 intrinsics)

// Loads the 128-bit value that _P points to. In practice _P is usually obtained
// through a pointer cast. Address _P must be 16-byte aligned.
extern __m128i _mm_load_si128(__m128i const*_P);
// Loads a 128-bit value. Address _P does not need to be 16-byte aligned.
extern __m128i _mm_loadu_si128(__m128i const*_P);
// r0 = *_P[63:0], r1 = 0x0 (zeroes the upper 64 bits of the result)
extern __m128i _mm_loadl_epi64(__m128i const*_P);

// Set operations (integer SSE2 intrinsics).
// "_set" takes its arguments highest element first, "_setr" lowest element first.

// r0 = _Q0, r1 = _Q1
extern __m128i _mm_set_epi64(__m64 _Q1, __m64 _Q0);
// r0 = _I0, r1 = _I1, r2 = _I2, r3 = _I3
extern __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0);
// Sets the register from 8 short values: r0 = _W0, r1 = _W1, ... r7 = _W7
extern __m128i _mm_set_epi16(short _W7, short _W6, short _W5, short _W4,
                             short _W3, short _W2, short _W1, short _W0);
// r0 = _B0, r1 = _B1, ... r15 = _B15
extern __m128i _mm_set_epi8(char _B15, char _B14, char _B13, char _B12,
                            char _B11, char _B10, char _B9, char _B8,
                            char _B7, char _B6, char _B5, char _B4,
                            char _B3, char _B2, char _B1, char _B0);
// r0 = _Q, r1 = _Q
extern __m128i _mm_set1_epi64(__m64 _Q);
// r0 = _I, r1 = _I, r2 = _I, r3 = _I
extern __m128i _mm_set1_epi32(int _I);
// r0 = _W, r1 = _W, ... r7 = _W
extern __m128i _mm_set1_epi16(short _W);
// r0 = _B, r1 = _B, ... r15 = _B
extern __m128i _mm_set1_epi8(char _B);
// r0 = _Q0 (low 64 bits of _Q).
// NOTE(review): the MSVC reference describes the upper 64 bits as zeroed - confirm.
extern __m128i _mm_setl_epi64(__m128i _Q);
// r0 = _Q0, r1 = _Q1
extern __m128i _mm_setr_epi64(__m64 _Q0, __m64 _Q1);
// r0 = _I0, r1 = _I1, r2 = _I2, r3 = _I3
extern __m128i _mm_setr_epi32(int _I0, int _I1, int _I2, int _I3);
// r0 = _W0, r1 = _W1, ... r7 = _W7
extern __m128i _mm_setr_epi16(short _W0, short _W1, short _W2, short _W3,
                              short _W4, short _W5, short _W6, short _W7);
// r0 = _B15, r1 = _B14, ... r15 = _B0 (the first argument becomes the lowest byte)
extern __m128i _mm_setr_epi8(char _B15, char _B14, char _B13, char _B12,
                             char _B11, char _B10, char _B9, char _B8,
                             char _B7, char _B6, char _B5, char _B4,
                             char _B3, char _B2, char _B1, char _B0);
// r = 0x0
extern __m128i _mm_setzero_si128(void);

// Store operations (integer SSE2 intrinsics)

// *_P = _B. Stores the register to the address _P, which in practice is usually
// obtained through a pointer cast. Address _P must be 16-byte aligned.
extern void _mm_store_si128(__m128i *_P, __m128i _B);
// *_P = _B. Address _P does not need to be 16-byte aligned.
extern void _mm_storeu_si128(__m128i *_P, __m128i _B);
// *_P[63:0] = _Q0 (stores the lower 64 bits only)
extern void _mm_storel_epi64(__m128i *_P, __m128i _Q);
// if (_N0[7]) _P[0] = _D0, if (_N1[7]) _P[1] = _D1, ... if (_N15[7]) _P[15] = _D15.
// The high bit of each byte in the selector _N determines whether the
// corresponding byte in _D will be stored. Address _P does not need to be
// 16-byte aligned.
extern void _mm_maskmoveu_si128(__m128i _D, __m128i _N, char *_P);

// Integer moves

// r0 = _Q0, r1 = 0x0 (zeroing the upper bits)
extern __m128i _mm_move_epi64(__m128i _Q);
// r0 = _Q, r1 = 0x0 (zeroing the upper bits)
extern __m128i _mm_movpi64_epi64(__m64 _Q);
// Returns an __m64, r = _Q0
extern __m64 _mm_movepi64_pi64(__m128i _Q);

// Cache support for Streaming SIMD Extensions 2 integer operations

// *_P = _A. Stores the data in _A to the address _P without polluting the caches.
// If the cache line containing address _P is already in the cache, the cache
// will be updated. Address _P must be 16-byte aligned.
extern void _mm_stream_si128(__m128i *_P, __m128i _A);
// The cache line containing _P is flushed and invalidated from all caches in
// the coherency domain.
extern void _mm_clflush(void const*_P);
// Guarantees that every load instruction that precedes, in program order, the
// load fence instruction is globally visible before any load instruction that
// follows the fence in program order.
extern void _mm_lfence(void);
// Guarantees that every memory access that precedes, in program order, the
// memory fence instruction is globally visible before any memory instruction
// that follows the fence in program order.
extern void _mm_mfence(void);
// *_P = _I. Stores the data in _I to the address _P without polluting the caches.
// If the cache line containing address _P is already in the cache, the cache
// will be updated.
extern void _mm_stream_si32(int *_P, int _I);
// The execution of the next instruction is delayed an implementation-specific
// amount of time. The instruction does not modify the architectural state.
// The original listing notes that this intrinsic provides an especially
// significant performance gain.
extern void _mm_pause(void);

/*---Support for casting between various SP, DP, INT vector types.
     Note that these do no conversion of values, they just change the type----*/

// Reinterprets two 64-bit floating-point values passed in as a 128-bit
// parameter as packed 32-bit floating-point values.
extern __m128 _mm_castpd_ps(__m128d);
// Reinterprets two 64-bit floating-point values passed in as a 128-bit
// parameter as packed 32-bit integers.
extern __m128i _mm_castpd_si128(__m128d);
// Reinterprets four 32-bit floating-point values passed in as a 128-bit
// parameter as packed 64-bit floating-point values.
extern __m128d _mm_castps_pd(__m128);
// Reinterprets four 32-bit floating-point values passed in as a 128-bit
// parameter as packed 32-bit integers.
extern __m128i _mm_castps_si128(__m128);
// Reinterprets four 32-bit integers passed in as a 128-bit parameter as packed
// 32-bit floating-point values.
extern __m128 _mm_castsi128_ps(__m128i);
// Reinterprets four 32-bit integers passed in as a 128-bit parameter as packed
// 64-bit floating-point values.
extern __m128d _mm_castsi128_pd(__m128i);
android.graphics.Paint.FontMetricsInt的实例源码
private void drawData(Canvas canvas) { // Draw the selected text first,and then draw up the rest of the text. float scale = parabola(mViewHeight / 4.0f,mMoveLen); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); // Text center drawing,pay attention to the calculation of baseline to reach the center,y value is text central coordinates. float x = (float) (mViewWidth / 2.0); float y = (float) (mViewHeight / 2.0 + mMoveLen); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); canvas.drawText(mDataList.get(mCurrentSelected),x,baseline,mPaint); // Draw the top data. for (int i = 1; (mCurrentSelected - i) >= 0; i++) { drawOtherText(canvas,i,-1); } // Draw below data. for (int i = 1; (mCurrentSelected + i) < mDataList.size(); i++) { drawOtherText(canvas,1); } }
@Override protected void onMeasure(int widthMeasureSpec,int heightMeasureSpec) { final int size = MeasureSpec.getSize(widthMeasureSpec); final int mode = MeasureSpec.getMode(widthMeasureSpec); if (!useSystemEmoji() && getEllipsize() == TruncateAt.END && !TextUtils.isEmpty(source) && (mode == MeasureSpec.AT_MOST || mode == MeasureSpec.EXACTLY) && getPaint().breakText(source,source.length()-1,true,size,null) != source.length()) { needsEllipsizing = true; FontMetricsInt font = getPaint().getFontMetricsInt(); super.onMeasure(MeasureSpec.makeMeasureSpec(size,MeasureSpec.EXACTLY),MeasureSpec.makeMeasureSpec(Math.abs(font.top - font.bottom),MeasureSpec.EXACTLY)); } else { needsEllipsizing = false; super.onMeasure(widthMeasureSpec,heightMeasureSpec); } }
@Override public int getSize(Paint paint,CharSequence text,int start,int end,FontMetricsInt fm) { Rect rect = this.getDrawable().getBounds(); if (fm != null) { FontMetricsInt fontMetricsInt = paint.getFontMetricsInt(); end = fontMetricsInt.bottom - fontMetricsInt.top; int var = rect.bottom - rect.top; start = var / 2 - end / 4; end = var / 2 + end / 4; fm.ascent = -end; fm.top = -end; fm.bottom = start; fm.descent = start; } return rect.right; }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,mMoveLen); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); // text居中绘制,注意baseline的计算才能达到居中,y值是text中心坐标 float x = (float) (mViewWidth / 2.0); float y = (float) (mViewHeight / 2.0 + mMoveLen); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); if (mDataList.size() > 0) { canvas.drawText(mDataList.get(mCurrentSelected),mPaint); } // 绘制上方data for (int i = 1; (mCurrentSelected - i) >= 0; i++) { drawOtherText(canvas,-1); } // 绘制下方data for (int i = 1; (mCurrentSelected + i) < mDataList.size(); i++) { drawOtherText(canvas,1); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,mMoveLen); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); // text居中绘制,注意baseline的计算才能达到居中,y值是text中心坐标 float x = (float) (mViewWidth / 2.0); float y = (float) (mViewHeight / 2.0 + mMoveLen); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); if (null != mItemProvider) { String itemData = mItemProvider.getItem(mDataList.get(mCurrentSelected),mCurrentSelected); canvas.drawText(itemData,1); } }
/** * @param canvas * @param position 距离mCurrentSelected的差值 * @param type 1表示向下绘制,-1表示向上绘制 */ private void drawOtherText(Canvas canvas,int position,int type) { float d = (float) (MARGIN_ALPHA * mMinTextSize * position + type * mMoveLen); float scale = parabola(mViewHeight / 4.0f,d); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); float y = (float) (mViewHeight / 2.0 + type * d); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); if (null != mItemProvider) { int index = mCurrentSelected + type * position; String itemData = mItemProvider.getItem(mDataList.get(index),index); canvas.drawText(itemData,(float) (mViewWidth / 2.0),mPaint); } }
private void drawProgress(Canvas canvas) { Paint bgPaint = getProgresspaint(); bgPaint.setColor(this.mIndicatorBgColor); canvas.drawLine(14.0f,(float) (this.mHeight / 2),(float) (this.mWidth - 14),(float) (this.mHeight / 2),bgPaint); if (this.mProgress != 0.0f) { Paint progresspaint = getProgresspaint(); progresspaint.setColor(this.mIndicatorProgressColor); int stopX = (int) (((float) (this.mWidth - 28)) * this.mProgress); canvas.drawLine(14.0f,(float) stopX,(float) (this .mHeight / 2),progresspaint); Paint textPain = getAlertPaint(); textPain.setTextSize((float) this.mIndicatorSize); textPain.setColor(this.mIndicatorTextColor); int textWidth = ViewUtils.getTextWidth(textPain,this.mAlert); Paint alertPaint = getProgresspaint(); alertPaint.setColor(this.mIndicatorProgressColor); alertPaint.setstrokeWidth(78.4f); canvas.drawLine((float) (stopX - (textWidth / 2)),(float) ((textWidth / 2) + stopX),alertPaint); FontMetricsInt fmi = textPain.getFontMetricsInt(); Canvas canvas2 = canvas; canvas2.drawText(this.mAlert,(float) (stopX - (textWidth / 2)),(float) ((this .mHeight / 2) + (Math.abs(fmi.bottom + fmi.top) / 2)),textPain); } }
@Override protected void onMeasure(int widthMeasureSpec,heightMeasureSpec); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,mMoveLen); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); // text居中绘制,注意baseline的计算才能达到居中,y值是text中心坐标 float x = (float) (mViewWidth / 2.0); float y = (float) (mViewHeight / 2.0 + mMoveLen); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); canvas.drawText(mDataList.get(mCurrentSelected),mPaint); // 绘制上方data for (int i = 1; (mCurrentSelected - i) >= 0; i++) { drawOtherText(canvas,1); } }
/** * 代码跟父类代码相似,就是getCachedDrawable()替换成getDrawable(),因为前者里面的图片是WeakReference, * 容易被gc回收,所以这里要避免这个问题 */ @Override public int getSize(Paint paint,FontMetricsInt fm) { Drawable d = getDrawable(); if (lineHeight > 0) { return (int) (d.getIntrinsicWidth() * scale); } else { Rect rect = d.getBounds(); if (fm != null) { fm.ascent = -rect.bottom; fm.descent = 0; fm.top = fm.ascent; fm.bottom = 0; } return rect.right; } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,1); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 3.2f,1); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,mMoveLen); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); // text居中绘制,注意baseline的计算才能达到居中,y值是text中心坐标 float x = (float) (mViewWidth / 2.0); float y = (float) (mViewHeight / 2.0 + mMoveLen); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); canvas.drawText(mDataList.get(mCurrentSelected).getPickName(),1); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,1); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,1); } }
@Override protected void onMeasure(int widthMeasureSpec,heightMeasureSpec); } }
private void drawData(Canvas canvas) { // 先绘制选中的text再往上往下绘制其余的text float scale = parabola(mViewHeight / 4.0f,mPaint); //暴力法则,上下都最多绘制10条数据 int count = Math.min(10,mDataList.size()); // 绘制上方data for (int i = 1; i <= count; i++) { drawOtherText(canvas,-1); } // 绘制下方data for (int i = 1; i <= count; i++) { drawOtherText(canvas,1); } }
/** * @param canvas * @param position * 距离mCurrentSelected的差 * @param type * 1表示向下绘制,-1表示向上绘制 */ private void drawOtherText(Canvas canvas,int type) { float d = (float) (MARGIN_ALPHA * mMinTextSize * position + type * mMoveLen); float scale = parabola(mViewHeight / 4.0f,d); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); float y = (float) (mViewHeight / 2.0 + type * d); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); int valuePosition = mCurrentSelected + type * position; //调整位置偏差在 合法范围 int dataSize = mDataList.size(); valuePosition = (valuePosition + dataSize) % dataSize; //if(valuePosition >= 0 & valuePosition < dataSize) {//以防万一 canvas.drawText(mDataList.get(valuePosition),mPaint); //} }
private static TextProperty computeTextProperty(final String string,final int width,final int height,final Paint paint) { final FontMetricsInt fm = paint.getFontMetricsInt(); final int h = (int) Math.ceil(fm.bottom - fm.top); int maxContentWidth = 0; final String[] lines = Cocos2dxBitmap.splitString(string,width,height,paint); if (width != 0) { maxContentWidth = width; } else { /* Compute the max width. */ int temp = 0; for (final String line : lines) { temp = (int) FloatMath.ceil(paint.measureText(line,line.length())); if (temp > maxContentWidth) { maxContentWidth = temp; } } } return new TextProperty(maxContentWidth,h,lines); }
private static int computeY(final FontMetricsInt fontMetricsInt,final int constrainHeight,final int totalHeight,final int verticalAlignment) { int y = -fontMetricsInt.top; if (constrainHeight > totalHeight) { switch (verticalAlignment) { case VERTICALALIGN_TOP: y = -fontMetricsInt.top; break; case VERTICALALIGN_CENTER: y = -fontMetricsInt.top + (constrainHeight - totalHeight) / 2; break; case VERTICALALIGN_BottOM: y = -fontMetricsInt.top + (constrainHeight - totalHeight); break; default: break; } } return y; }
private static TextProperty computeTextProperty(final String string,lines); }
private static int computeY(final FontMetricsInt fontMetricsInt,final int verticalAlignment) { int y = -fontMetricsInt.top; if (constrainHeight > totalHeight) { switch (verticalAlignment) { case VERTICALALIGN_TOP: y = -fontMetricsInt.top; break; case VERTICALALIGN_CENTER: y = -fontMetricsInt.top + (constrainHeight - totalHeight) / 2; break; case VERTICALALIGN_BottOM: y = -fontMetricsInt.top + (constrainHeight - totalHeight); break; default: break; } } return y; }
private static TextProperty computeTextProperty(final String pString,final int pWidth,final int pHeight,final Paint pPaint) { final FontMetricsInt fm = pPaint.getFontMetricsInt(); final int h = (int) Math.ceil(fm.bottom - fm.top); int maxContentWidth = 0; final String[] lines = Cocos2dxBitmap.splitString(pString,pWidth,pHeight,pPaint); if (pWidth != 0) { maxContentWidth = pWidth; } else { /* Compute the max width. */ int temp = 0; for (final String line : lines) { temp = (int) FloatMath.ceil(pPaint.measureText(line,lines); }
private static int computeY(final FontMetricsInt pFontMetricsInt,final int pConstrainHeight,final int pTotalHeight,final int pVerticalAlignment) { int y = -pFontMetricsInt.top; if (pConstrainHeight > pTotalHeight) { switch (pVerticalAlignment) { case VERTICALALIGN_TOP: y = -pFontMetricsInt.top; break; case VERTICALALIGN_CENTER: y = -pFontMetricsInt.top + (pConstrainHeight - pTotalHeight) / 2; break; case VERTICALALIGN_BottOM: y = -pFontMetricsInt.top + (pConstrainHeight - pTotalHeight); break; default: break; } } return y; }
@Override public int getSize(Paint paint,FontMetricsInt fm) { fm = fm == null ? paint.getFontMetricsInt() : fm; int iconSize = fm.bottom - fm.top; mDrawable.setBounds(0,iconSize,iconSize); return super.getSize(paint,text,start,end,fm); }
/** * @param position The difference between the distance mCurrentSelected. * @param type The difference from mCurrentSelected 1 means downward drawing,and -1 indicates upward rendering. */ private void drawOtherText(Canvas canvas,int type) { float d = MARGIN_ALPHA * mMinTextSize * position + type * mMoveLen; float scale = parabola(mViewHeight / 4.0f,d); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; nPaint.setTextSize(size); nPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); float y = (float) (mViewHeight / 2.0 + type * d); FontMetricsInt fmi = nPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); canvas.drawText(mDataList.get(mCurrentSelected + type * position),nPaint); }
@Override public int getSize(Paint paint,FontMetricsInt fm) { if (fm != null && this.fm != null) { fm.ascent = this.fm.ascent; fm.descent = this.fm.descent; fm.top = this.fm.top; fm.bottom = this.fm.bottom; return size; } else { return super.getSize(paint,fm); } }
/** * @param canvas * @param position 距离mCurrentSelected的差值 * @param type 1表示向下绘制,-1表示向上绘制 */ private void drawOtherText(Canvas canvas,int type) { float d = (float) (MARGIN_ALPHA * mMinTextSize * position + type * mMoveLen); float scale = parabola(mViewHeight / 4.0f,d); float size = (mMaxTextSize - mMinTextSize) * scale + mMinTextSize; mPaint.setTextSize(size); mPaint.setAlpha((int) ((mMaxTextAlpha - mMinTextAlpha) * scale + mMinTextAlpha)); float y = (float) (mViewHeight / 2.0 + type * d); FontMetricsInt fmi = mPaint.getFontMetricsInt(); float baseline = (float) (y - (fmi.bottom / 2.0 + fmi.top / 2.0)); canvas.drawText(mDataList.get(mCurrentSelected + type * position),mPaint); }
@Override public int getSize(Paint paint,FontMetricsInt fm) { if (fm != null && this.fm != null) { fm.ascent = this.fm.ascent; fm.descent = this.fm.descent; fm.top = this.fm.top; fm.bottom = this.fm.bottom; return size; } else { return super.getSize(paint,fm); } }
private String getFileAddedWatermark(String path,String text,String voice) { if (!new File(path).exists()) { return null; } Options options = new Options(); options.inJustDecodeBounds = false; Bitmap bitmap0 = BitmapFactory.decodeFile(path,options); int width = options.outWidth; int height = options.outHeight; Bitmap bitmapPic = Bitmap.createBitmap(width,Config.ARGB_8888); Canvas canvas = new Canvas(bitmapPic); canvas.drawBitmap(bitmap0,null,new Rect(0,height),null); Paint textPaint = new Paint(); textPaint.setColor(-1); textPaint.setTextSize((float) getTextSize()); textPaint.setFlags(2); canvas.drawText(text,text.length(),30.0f,50.0f,textPaint); Rect rect = new Rect(0,height - getTextBgHeight(),height); if (!isOrigPicMode) { Paint voiceBgPaint = new Paint(); voiceBgPaint.setColor(getResources().getColor(2131493164)); voiceBgPaint.setStyle(Style.FILL); voiceBgPaint.setFlags(2); canvas.drawRect(rect,voiceBgPaint); voiceBgPaint.setColor(-1); voiceBgPaint.setTextSize((float) getTextSize()); voiceBgPaint.setTextAlign(Align.CENTER); FontMetricsInt fontMetrics = voiceBgPaint.getFontMetricsInt(); int baseline = (rect.top + ((((rect.bottom - rect.top) - fontMetrics.bottom) + fontMetrics.top) / 2)) - fontMetrics.top; canvas.drawText(voice,(float) rect.centerX(),(float) baseline,voiceBgPaint); } return saveBitmap(bitmapPic); }
private boolean saveFileAddedVoice(String path,String voice,String savePath) { LogInfo.log("fornia","voice:" + voice); if (!new File(path).exists()) { return false; } Options options = new Options(); options.inJustDecodeBounds = false; Bitmap bitmap0 = BitmapFactory.decodeFile(path,null); Rect targetBgRect = new Rect(0,height); if (!isOrigPicMode) { Paint voiceBgPaint = new Paint(); voiceBgPaint.setColor(getResources().getColor(2131493164)); voiceBgPaint.setStyle(Style.FILL); voiceBgPaint.setFlags(2); canvas.drawRect(targetBgRect,voiceBgPaint); voiceBgPaint.setColor(-1); voiceBgPaint.setTextSize((float) getTextSize()); voiceBgPaint.setTextAlign(Align.CENTER); FontMetricsInt fontMetrics = voiceBgPaint.getFontMetricsInt(); String str = voice; canvas.drawText(str,(float) targetBgRect.centerX(),(float) ((targetBgRect.top + ((((targetBgRect.bottom - targetBgRect.top) - fontMetrics.bottom) + fontMetrics.top) / 2)) - fontMetrics.top),voiceBgPaint); } return FileUtils.saveBitmapByUser(this.mContext,bitmapPic); }
@Override public int getSize(@NonNull Paint paint,FontMetricsInt fm) { setupFontMetrics(text,fm,paint); if (fm != null) { final int padding = dims.getPadding(); final int margin = dims.getMarginTop(); fm.ascent = Math.min(fm.top,fm.ascent - padding) - margin; fm.descent = Math.max(fm.bottom,padding); fm.top = fm.ascent; fm.bottom = fm.descent; } return measureWidth(txtPaint,dims.isRtl()); }
private void setupFontMetrics(CharSequence text,FontMetricsInt fm,Paint p) { txtPaint.set(p); final CharacterStyle[] otherSpans = ((Spanned) text).getSpans(start,CharacterStyle.class); for (CharacterStyle otherSpan : otherSpans) { otherSpan.updateDrawState(txtPaint); } txtPaint.setTextSize(p.getTextSize()); if (fm != null) { txtPaint.getFontMetricsInt(fm); } }
/** * @param wp */ private static void expandMetricsFromPaint(FontMetricsInt fmi,TextPaint wp) { final int prevIoUsTop = fmi.top; final int prevIoUsAscent = fmi.ascent; final int prevIoUsDescent = fmi.descent; final int prevIoUsBottom = fmi.bottom; final int prevIoUsLeading = fmi.leading; wp.getFontMetricsInt(fmi); updateMetrics(fmi,prevIoUsTop,prevIoUsAscent,prevIoUsDescent,prevIoUsBottom,prevIoUsLeading); }
ARM Neon Intrinsics 学习指北:从入门、进阶到学个通透
【GiantPandaCV导语】Neon是手机普遍支持的计算加速指令集,是AI落地的工程利器。Neon Intrinsics 的出现,缓解了汇编语言难学难写的难题,值得工程师们开发利用。
前言
Neon是ARM平台的向量化计算指令集,通过一条指令完成多个数据的运算达到加速的目的,常用于AI、多媒体等计算密集型任务。
本文主要是一篇对ARM官方资料的导读。笔者根据自己Neon学习经历,将这些资料按照逻辑进行组织,以期减少读者的学习成本。
本文讨论的是Neon 的intrinsics,而非assembly。intrinsics是以类似调用C语言函数的方法调用Neon,并由编译器生成最终的二进制代码,assembly则是手工嵌入Neon汇编,直接生成二进制代码。如果您想了解的是Neon assembly,可以参考这篇文章:https://zhuanlan.zhihu.com/p/143328317。笔者后续也会补充assembly的内容。
下文将按以下目录组织,方便读者选读感兴趣的内容。
1.入门:基本能上手写Intrinsics
1.1 Neon介绍、简明案例与编程惯例
1.2 如何检索Intrinsics
1.3 优化效果案例
1.4 如何在Android应用Neon
2. 进阶:注意细节处理,学习常用算子的实现
2.1 与Neon相关的ARM体系结构
2.2 对非整数倍元素个数(leftovers)的处理技巧
2.3 算子源码学习(ncnn库,AI方向)
2.4 算子源码学习(Nvidia carotene库,图像处理方向 )
3. 学个通透:了解原理
3.1 SIMD加速原理
3.2 了解硬件决定的速度极限:Software Optimization Guide
3.3 反汇编分析生成代码质量
4. 其他:相关的研讨会视频、库、文档等
1 入门
1.1 Neon介绍、简明案例与编程惯例
推荐阅读ARM官方的 Optimizing C Code with Neon Intrinsics (https://developer.arm.com/documentation/102467/0100/)
该资料以HWC转CHW(permute)操作、矩阵乘法为例子,介绍如何将普通C++实现改写为Neon Intrinsics的实现。
重点:第6小节program conventions(编程惯例)介绍了Neon输入输出的对象类型和intrinsics命名规则。Intrinsics命名规则还是比较复杂的(如下图),如果没弄清楚,后期可能会检索不到需要的intrinsics或误用intrinsics。

1.2 如何检索Intrinsics
在1.1了解改写方法后,将自己的代码用相应的Intrinsics改写,即可应用Neon加速。ARM官方制作了intrinsics检索页面 (https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics)。
以float32类型的点对点加法intrinsics vaddq_f32(q的含义请查阅上面命名规则,了解命名规则真的很重要)为例,检索页面从上到下依次给出操作描述、对应的汇编指令、入参、出参、伪代码和支持该intrinsics 的架构。


注意兼容性问题: 核查intrinsics是否能用于项目所需兼容的全部架构。例如,笔者进行的项目需要兼容新旧手机,且当前只能集成ARMv7a(新旧手机均能运行,即图中的v7)架构的so库,而Aarch64(仅新手机支持,即图中的A64)架构的so库尚不支持。如果某条intrinsics的supported architectures只有A64,那么在编译ARMv7a架构的so库时将会无法通过编译。对于这种情况,只能放弃使用这条intrinsics,改用多条intrinsics拼凑出等效实现(指令数的增加意味着性能的降低,为了兼容性这是无可奈何的事情)。
1.3 优化效果案例
为了应用Neon需要学那么多东东,究竟能达到怎样的效果?Intrinsics优化生成的汇编和人工优化手写的汇编还有多少差距?别急,我们可以看下带有速度benchmark的案例,例如BBuf 写的 一份朴实无华的移动端盒子滤波算法优化笔记 (https://zhuanlan.zhihu.com/p/170611395)。
直接把优化结果摘出来

优化好算法后,普通C++实现是302.00ms(不排除编译器优化了部分运算,生成了部分Neon指令),Intrinsics实现是188.37ms,最优Assembly实现是145.92ms。Intrinsics优化和Assembly优化分别加速了约38%和约52%,Intrinsics优化生成的代码还和手工优化的Assembly存在差距。但注意到,如果仅是单纯地用Assembly改写C++,优化效果和Intrinsics一样(188.37ms VS 187.70ms),进一步的性能改进来自于额外的预取指令(pld)和对双发射流水(硬件设计)的利用。简而言之,如果项目留有足够的优化时间并且对体系结构的驾驭能力有信心(至少速度不会低于Intrinsics),选择Assembly优化,否则选择Intrinsics优化。
1.4 如何在Android应用Neon
直接参考ARM官方的Demo制作教程(还带了演示编写代码的视频):
Neon Intrinsics Getting Started on Android(https://developer.arm.com/solutions/os/android/developer-guides/neon-intrinsics-getting-started-on-android)
Neon Intrinsics on Android How to Truncate Thresholding and Convolution of A 1D Signal (https://developer.arm.com/solutions/os/android/developer-guides/neon-intrinsics-on-android-how-to-truncate-thresholding-and-convolution-of-a-1d-signal)
核心点就是,在Gradle加上Neon的启用命令(如下红框),然后在cpp引用头文件 #include <arm_neon.h>,写代码就完事了。余下的都是带Native代码的安卓开发的知识。

关于在iOS开发中启用Neon(Xcode,尝试的版本为12.3):笔者尝试过将Android的NDK代码迁移至iOS,在不修改Xcode任何设置的情况下,arm_neon.h可找到,编译可通过,那说明Xcode默认打开了Neon的启用开关,写就完事了。
2 进阶
2.1 与Neon相关的ARM体系结构
利用指令集加速,无一例外地要利用专用寄存器这种在CPU上稀少、宝贵的资源。专用寄存器用少了CPU的性能不能充分发挥,用多了则会产生寄存器溢出(Register Spilling)(https://blog.csdn.net/qq_41112170/article/details/90286091)这种对性能有明显负面影响的问题。因此,我们至少需要了解在编写Neon代码时,有多少个专用寄存器可供利用,这就涉及到体系结构的知识。推荐阅读ARM官方的 Introducing Neon for Armv8-A(https://developer.arm.com/documentation/102474/0100/)
重点:
(1)了解registers, vectors,lanes, elements的概念以及它们对专用寄存器的占用;
(2)新的Armv8a架构有32个128bit向量寄存器,老的ArmV7a架构有32个64bit(可当作16个128bit)向量寄存器,编码时记得数一下占用多少个专用寄存器(例如1个float32x4就占用1个128bit寄存器),别用过量了,避免寄存器溢出(Register Spilling)(https://blog.csdn.net/qq_41112170/article/details/90286091)导致的负优化。
如果对ARM体系结构感兴趣,可以阅读更系统的 Cortex-A Series Programmer's Guide(https://developer.arm.com/documentation/den0013/latest)。
2.2 对非整数倍元素个数(leftovers)的处理技巧
一条Neon指令最多可以计算4个float32,或者8个float16,或者16个int8。假设现在有3个或5个(即不是4的整数倍)float32需要计算,请问应该怎样解决呢?
ARM官方的Coding for Neon(https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a/coding-for-neon/),在第4节 Load and store - leftovers给了处理技巧

除了处理leftovers,文章中还有一些操作值得学习,比如一条指令完成的shifting and inserting。
2.3 算子源码学习(ncnn库,AI方向)
ncnn(https://github.com/Tencent/ncnn/tree/e16b338b136c94805bc7d0ef3756f2dc4bfa3408/src/layer)是腾讯开源,nihui维护的AI推理引擎。2017年开源至今,其代码依然保持着组织结构简单、易重用的优点。ncnn实现的算子包含普通实现(无针对平台的加速)和针对3种平台(arm/vulkan/x86)的指令集加速实现(注:可能有的算子有普通实现,但没有平台加速实现,毕竟做加速实现还是比较耗费精力的,致敬nihui大大)。
由于Neon实现往往跟循环展开等技巧一起使用,代码往往比较长。我们可以先阅读普通实现的代码实现了解顶层逻辑,再阅读Neon实现的代码。例如,我们希望学习全连接层(innerproduct)的Neon实现,其普通实现的位置在ncnn/src/layer/innerproduct.cpp,对应的Neon加速实现的位置在ncnn/src/layer/arm/innerproduct_arm.cpp。

注意代码中出现较多的条件编译。原因我们上文提到过,有的intrinsics是较新Aarch64架构下专有的,为了兼容较老的ArmV7a架构,对于不能用的intrinsics只能用多条兼容的intrinsics等效替代。为了保证Aarch64下的性能同时保证对ArmV7a的兼容性,ncnn采用条件编译的方式处理不兼容的intrinsics(条件编译就是只编译满足条件的分支的代码,不满足条件的代码将不出现在最终的二进制文件中)。
如果你只关注Aarch64平台下的实现,下图#else分支的代码跳过不看即可。

2.4 算子源码学习(Nvidia carotene库,图像处理方向 )
了解carotene(https://github.com/opencv/opencv/tree/master/3rdparty/carotene)库的人应该不多,但了解OpenCV的人应该不少吧?carotene能够作为OpenCV的第三方库(third party)存在,足以证明其代码质量。
carotene的组织结构同样简单,且不依赖OpenCV的数据结构,想用想学习哪个函数直接提取出来即可。如下图,里面主要用Neon实现了色彩空间转换、均值滤波、Canny边缘检测等常见的图像处理算子。

3. 学个通透
3.1 SIMD加速原理
即使到了这里,我们仍然对Neon(或类似的SIMD指令)为什么能加速我们的代码充满疑问。我们可以在这本书(计算机体系结构的圣经之一)找寻答案——《计算机体系结构:量化研究方法》。Neon是ARM平台的SIMD(Single Instruction Multiple Data,单指令多数据流)指令集实现,书中4.1~4.3讨论了SIMD,推荐阅读。

关于这个问题,笔者的总结如下:
(1)通过加长的寄存器减少数据的读取/写入次数,从而减少将数据读入寄存器的时间开销。例如Neon可以一次性将16个int8(16*8=128bit)数据读入专用寄存器,这一次读取时间开销,明显少于16个int8数据一个一个地读入的时间之和。写入同理。(注意不要和cache的减少访存时间的原理混淆。从cache读取余下的第2~第16个int8数据到寄存器仍然是要花费时钟周期的)。
(2)执行SISD(Single Instruction, Single Data,单指令流单数据流,这里可理解为标量计算)指令时,需要完成(时间开销大的)冒险(hazard)检查。既然使用SIMD指令计算,就暗示这些数据之间无依赖性,也就从指令集层面回避了不必要的时间开销。
3.2 了解硬件决定的速度极限:Software Optimization Guide
我们可能还要关心,我们所编写的Neon Intrinsics,可以将手头上硬件的性能发挥到多少水平?是否还有提升空间?这些是好问题。
在讨论这个问题前,先插入一个使笔者拍案叫绝的相关案例:在另一本计算机经典**《深入理解计算机系统》** (一般简称 CS:APP)的第5章 优化程序性能 中,该书作者考虑若干计算机硬件特性,将矩阵乘法连续优化了6个版本,直至优化到了该x86 CPU的吞吐量上限(注:对于某种指令,延迟latency 主要关注单条该指令的最小执行时间,吞吐量throughput主要关注单位时间内系统(一个CPU核)最多执行多少条该指令。因为AI计算的数据量比较大,我们更关注吞吐量)。

回到问题,我们需要知道我们的吞吐量上界是多少。ARM官方为每个CPU架构(手机CPU一般大核是A7X架构,小核是A5X架构)提供对应的Software Optimization Guide,里面有进行各种运算的latency和throughput。以A76架构(采用该架构作为大核架构的CPU例如骁龙855,麒麟980)为例子,从ARM官网下载对应的pdf(https://developer.arm.com/documentation/swog307215/a/?lang=en)
翻到ASIMD(Advanced SIMD)那里,就能查阅各条Neon指令相应的latency和throughput。不同架构的吞吐量上界会有所不同,其他架构请自行在ARM官网文档中心下载。

理论数据有了,至于如何通过实验测试峰值,可参考BBuf的文章 如何判断算法是否有可优化空间? (https://zhuanlan.zhihu.com/p/268925243)
3.3 反汇编分析生成代码质量
可通过反汇编的方式查看Intrinsics 生成的汇编是否满足预期,如果不满足预期则进行手写汇编优化。具体操作可参考梁德澎的文章 移动端arm cpu优化学习笔记第4弹--内联汇编入门(https://zhuanlan.zhihu.com/p/143328317)
4. 其他
余下的是相关的研讨会视频、库和案例。第一个视频帮助我建立了优化分析思维,值得推荐。
(1)研讨会视频 "Performance Analysis for Optimizing Embedded Deep Learning Inference Software," a Presentation from Arm - Edge AI and Vision Alliance
https://www.edge-ai-vision.com/2019/07/performance-analysis-for-optimizing-embedded-deep-learning-inference-software-a-presentation-from-arm/
(2)研讨会视频 LCU14-504: Taming ARMv8 NEON: from theory to benchmark results
https://www.youtube.com/watch?v=ixuDntaSnHI
(3)研讨会视频 HKG15-408: ARM v8-A NEON optimization
https://www.youtube.com/watch?v=NYFzidaS3Z4
(4)Ne10(ARM官方的计算库):
https://github.com/projectNe10/Ne10
(5)Arm Optimized Routines(ARM官方的计算、网络、字符串库):
https://github.com/ARM-software/optimized-routines
(6)Neon优化Chromium的案例
https://developer.arm.com/documentation/101964/
欢迎关注GiantPandaCV, 在这里你将看到独家的深度学习分享,坚持原创,每天分享我们学习到的新鲜知识。( • ̀ω•́ )✧
有对文章相关的问题,或者想要加入交流群,欢迎添加BBuf微信:

为了方便读者获取资料以及我们公众号的作者发布一些Github工程的更新,我们成立了一个QQ群,二维码如下,感兴趣可以加入。

本文分享自微信公众号 - GiantPandaCV(BBuf233)。
如有侵权,请联系 support@oschina.cn 删除。
本文参与“OSC源创计划”,欢迎正在阅读的你也加入,一起分享。
C 常用函数介绍
1. strcpy
char *strcpy(char *destin, char *source);
功能:将 source 指向的字符串拷到 destin。
1 int main()
2 {
3
4 char dest[5];
5 char *src="123456";
6 strcpy(dest, src);
7 printf("dest= %s, %s, %s", dest, dest+4, dest+5);
8
9 return 0;
10 }
从结果可知确实将 src 的内容复制过去了,但 dest 只有 5 个字节,而 src 连同结束符共 7 个字节,全部复制会越界写入 dest 之后的内存(缓冲区溢出),使用不当就会出错!
2. strncpy
char *strncpy(char *destin, char *source, int len);
功能:将 source 指向的字符串的前 len 个字符拷到 destin。
1 int main()
2 {
3
4 char dest[5];
5 char *src="123456";
6 strncpy(dest, src, 3);
7 dest[3]= ''\0'';
8 printf("dest= %s, %s, %s", dest, dest+4, dest+5);
9
10 return 0;
11 }
结果可知手动加上 '\0' 结束符后 dest 的内容变得更安全:strncpy 在 len 不超过源字符串长度时不会自动补结束符,需要额外手动添加(strcpy 则会连同结束符一起复制)!
3. strcat
char* strcat(char * str1,char * str2);
功能:把字符串 str2 接到 str1 后面,str1 最后的 '\0' 被覆盖
1 int main()
2 {
3
4 char dest[5]="abcd";
5 char *src="123456";
6 strcat(dest, src);
7 printf("dest= %s", dest);
8
9 return 0;
10 }
4. strncat
char *strncat(char *dest, const char *src, size_t maxlen)
功能:将字符串 src 中前 maxlen 个字符连接到 dest 中
1 int main()
2 {
3
4 char dest[10]="abcd";
5 char *src="123456";
6 strncat(dest, src, 8);
7 printf("dest= %s", dest);
8
9 return 0;
10 }
与 strncpy 不同,strncat 会自动在末尾加 '\0',若指定长度超过源字符串长度,则只复制源字符串长度即停止,相对更安全!但仍需保证 dest 的剩余空间足以容纳追加的内容和结束符(上例中 "abcd" 加 "123456" 再加结束符共需 11 个字节,dest[10] 仍会越界)。
5. strcmp
int strcmp(char * str1,char * str2);
功能:比较两个字符串 str1,str2
返回: str1<str2, 返回负数;str1=str2, 返回 0;str1>str2, 返回正数
1 int main()
2 {
3
4 char dest[10]="abcd";
5 char *src="a23456";
6 char d2[8]="abcd";
7 int res;
8 res=strcmp(dest, src);
9 printf("res= %d \n", res);
10 res=strcmp(dest, d2);
11 printf("res= %d \n", res);
12
13 return 0;
14 }
结果可知每一位都要比较,且与原字符数组长度无关。
6. strncmp
int strncmp(char *str1,char *str2,int count)
功能:对 str1 和 str2 中的前 count 个字符按字典顺序比较
返回:小于 0:str1<str2,等于 0:str1=str2,大于 0:str1>str2
int main()
{
char dest[10]="abcd";
char *src="a23456";
char d2[8]="abcd";
int res;
res=strncmp(dest, src, 1);
printf("res= %d \n", res);
res=strncmp(dest, d2, 1);
printf("res= %d \n", res);
return 0;
}
7. strchr
char* strchr(char* str,char ch);
功能:找出 str 指向的字符串中第一次出现字符 ch 的位置
返回:返回指向该位置的指针,如找不到,则返回空指针
1 int main()
2 {
3
4 char dest[10]="abcd";
5 char* rp;
6 char ch= ''c'';
7 rp=strchr(dest, ch);
8 if(NULL == rp)
9 printf("no %c exist", ch);
10 else
11 printf("pos of %c is %d", ch, (int)(rp-dest+1));
12
13 return 0;
14 }
8. strrchr
char *strrchr(const char *s, int c)
功能:得到字符串 s 中最后一个含有 c 字符的位置指针
返回:位置指针
1 int main()
2 {
3
4 char dest[10]="abcdabc";
5 char* rp;
6 char ch= ''c'';
7 rp=strrchr(dest, ch);
8 if(NULL == rp)
9 printf("no %c exist", ch);
10 else
11 printf("pos of %c is %d", ch, (int)(rp-dest+1));
12
13 return 0;
14 }
strrchr 比 strchr 多的 r 意指反向寻找,位置都是从 1 开始计数(非从 0 开始)
9. strstr
char* strstr(char* str1,char* str2);
功能:找出 str2 字符串在 str1 字符串中第一次出现的位置 (不包括 str2 的串结束符)
返回:返回该位置的指针,如找不到,返回空指针
1 int main()
2 {
3
4 char dest[10]="abcdabc";
5 char* rp;
6 char ch1[]= "c";
7 char str2[]= "cda";
8 rp=strstr(dest, ch1);
9 if(NULL == rp)
10 printf("no %s exist", ch1);
11 else
12 printf("substring is %s \n", rp);
13
14 rp=strstr(dest, str2);
15 if(NULL == rp)
16 printf("no %s exist", str2);
17 else
18 printf("substring is %s ", rp);
19
20 return 0;
21 }
也可以查找只含单个字符的字符串(但不能直接传入 char 字符,它不符合参数类型要求,必须以字符串形式传入)
10. strnset
char *strnset(char *s, int ch, size_t n)
功能:将字符串 s 中前 n 个字符设置为 ch 的值
返回:指向 s 的指针
1 int main()
2 {
3
4 char dest[10]="abcdabc";
5 char* rp;
6 char ch= ''F'';
7 rp=strnset(dest, ch, 4);
8 printf("after strnset dest is %s \n", rp);
9
10 return 0;
11 }
11. strset
char *strset(char *s, int ch)
功能:将字符串 s 中所有字符设置为 ch 的值
返回:指向 s 的指针
1 int main()
2 {
3
4 char dest[10]="abcdabc";
5 char* rp;
6 char ch= ''F'';
7 rp=strset(dest, ch);
8 printf("after strnset dest is %s \n", rp);
9 printf("after strnset dest is %s \n", dest);
10 return 0;
11 }
结果的 rp 和 dest 都被修改为同一内容!
12. strtok
char *strtok(char *s1, const char *s2)
功能:分解 s1 字符串,用特定分隔符 (s2) 分隔成多个字符串
返回:字符串 s1 中首次出现 s2 中的字符前的子字符串指针
strtok () 在参数 s1 的字符串中发现参数 s2 中包含的分割字符时,则会将该字符改为 \0 字符。在第一次调用时,strtok () 必须给予参数 s1 字符串,往后的调用则将参数 s1 设置成 NULL。每次调用成功则返回指向被分割出片段的指针。
1 int main()
2 {
3
4 char dest[]="ab,cd,ef,c";
5 char* rp;
6 char ch[]= ",";
7 rp=strtok(dest, ch);
8 while(NULL != rp)
9 {
10 printf("dest: %s ", dest);
11 printf("rp: %s \n", rp);
12 rp=strtok(NULL, ch);
13 }
14
15 return 0;
16 }
说明:尽量使用可重入版的 strtok,Windows 平台下为 strtok_s,Linux 平台下为 strtok_r。
牢记 strtok 函数族的分隔规则:忽略字符串前后的分隔符,连续的分隔符被当做一个处理。
在使用 strtok 前,请对源字符串进行备份,除非你可以接受字符串被修改这一事实(修改为分隔的第一个字符串)。
13. strupr
char *strupr(char *s)
功能:将字符串 s 中的字符变为大写
1 int main()
2 {
3
4 char dest[]="ab,cd,EF,c";
5 char* rp;
6 rp=strupr(dest);
7 printf("dest: %s, rp: %s", dest, rp);
8
9 return 0;
10 }
原字符串 dest 也被修改!对符号和大写字符无影响。
char *strlwr (char *s) 与它相反,将字符串中的字符变为小写字符
还有一些 memxxx () 函数下次单独说明,有问题欢迎评论~~
cocoa – – [NSTextField intrinsicContentSize]始终具有未定义的宽度
如果我以编程方式创建textField,如下所示:
_textfield = [[NSTextField alloc] initWithFrame:CGRectZero]; _textfield.translatesAutoresizingMaskIntoConstraints = NO; _textfield.alignment = NSCenterTextAlignment; _textfield.drawsBackground = NO; [_textfield setBordered:NO]; _textfiled.stringValue = @"Test"
它的intrinsicContentSize始终是(width = -1,height = 16)
我试过调用invalidateIntrinsicContentSize但无济于事……
如何让NSTextField的intrinsicContentSize工作?我必须做一些与xib不同的事情,我无法弄清楚是什么.
解决方法
[_textfield setEditable:NO]
就是这样。我想,对于可编辑的文本字段,必须对文本字段宽度有明确的约束。这也说得通:想象一下编辑一个文本字段时,它会随着每次按键不断地水平增长……这并不是理想的用户界面。
关于SSE2 Intrinsics各函数介绍和ss()函数的问题我们已经讲解完毕,感谢您的阅读,如果还想了解更多关于android.graphics.Paint.FontMetricsInt的实例源码、ARM Neon Intrinsics 学习指北:从入门、进阶到学个通透、C
本文标签: