SSE2 Instructions to Include

x86 and x86-64 - SSE2 integer C-intrinsic reference from the [|MSDN Library]. The intrinsic data type **_m128i** refers to an xmm register or a memory location:
 * ~ Mnemonic ||~ Description ||~  ||~ C-Intrinsic ||~   ||
 * ~ bitwise logical ||~ return ||~  ||~ parameter ||
 * **pand** || packed and, r := a & b ||> _m128i || [|_mm_and_si128] || (_m128i a, _m128i b) ||
 * **pandn** || packed and not, r := ~a & b ||> _m128i || [|_mm_andnot_si128] || (_m128i a, _m128i b) ||
 * **por** || packed or, r := a | b ||> _m128i || [|_mm_or_si128] || (_m128i a, _m128i b) ||
 * **pxor** || packed xor, r := a ^ b ||> _m128i || [|_mm_xor_si128] || (_m128i a, _m128i b) ||
 * ~ quad word shifts ||>  ||~   ||   ||
 * **psrlq** || packed shift right logical quad ||> _m128i || [|_mm_srl_epi64] || (_m128i a, _m128i cnt) ||
 * || immediate ||> _m128i || [|_mm_srli_epi64] || (_m128i a, int cnt) ||
 * **psllq** || packed shift left logical quad ||> _m128i || [|_mm_sll_epi64] || (_m128i a, _m128i cnt) ||
 * || immediate ||> _m128i || [|_mm_slli_epi64] || (_m128i a, int cnt) ||
 * ~ arithmetical ||>  ||~   ||   ||
 * **paddb** || packed add bytes ||> _m128i || [|_mm_add_epi8] || (_m128i a, _m128i b) ||
 * **psubb** || packed subtract bytes ||> _m128i || [|_mm_sub_epi8] || (_m128i a, _m128i b) ||
 * **psadbw** || packed sum of absolute differences of bytes into a word ||> _m128i || [|_mm_sad_epu8] || (_m128i a, _m128i b) ||
 * **pmaxsw** || packed maximum signed words ||> _m128i || [|_mm_max_epi16] || (_m128i a, _m128i b) ||
 * **pmaxub** || packed maximum unsigned bytes ||> _m128i || [|_mm_max_epu8] || (_m128i a, _m128i b) ||
 * **pminsw** || packed minimum signed words ||> _m128i || [|_mm_min_epi16] || (_m128i a, _m128i b) ||
 * **pminub** || packed minimum unsigned bytes ||> _m128i || [|_mm_min_epu8] || (_m128i a, _m128i b) ||
 * **pcmpeqb** || packed compare equal bytes ||> _m128i || [|_mm_cmpeq_epi8] || (_m128i a, _m128i b) ||
 * **pmullw** || packed multiply low signed (unsigned) word ||> _m128i || [|_mm_mullo_epi16] || (_m128i a, _m128i b) ||
 * **pmulhw** || packed multiply high signed word ||> _m128i || [|_mm_mulhi_epi16] || (_m128i a, _m128i b) ||
 * **pmulhuw** || packed multiply high unsigned word ||> _m128i || [|_mm_mulhi_epu16] || (_m128i a, _m128i b) ||
 * **pmaddwd** || packed multiply words and add doublewords ||> _m128i || [|_mm_madd_epi16] || (_m128i a, _m128i b) ||
 * ~ unpack, shuffle ||>  ||~   ||   ||
 * **punpcklbw** || unpack and interleave low bytes ||> _m128i || [|_mm_unpacklo_epi8] || (_m128i A, _m128i a) ||
 * **punpckhbw** || unpack and interleave high bytes ||> _m128i || [|_mm_unpackhi_epi8] || (_m128i A, _m128i a) ||
 * **punpcklwd** || unpack and interleave low words ||> _m128i || [|_mm_unpacklo_epi16] || (_m128i A, _m128i a) ||
 * **punpckhwd** || unpack and interleave high words ||> _m128i || [|_mm_unpackhi_epi16] || (_m128i A, _m128i a) ||
 * **punpckldq** || unpack and interleave low doublewords ||> _m128i || [|_mm_unpacklo_epi32] || (_m128i A, _m128i a) ||
 * **punpckhdq** || unpack and interleave high doublewords ||> _m128i || [|_mm_unpackhi_epi32] || (_m128i A, _m128i a) ||
 * **punpcklqdq** || unpack and interleave low quadwords ||> _m128i || [|_mm_unpacklo_epi64] || (_m128i A, _m128i a) ||
 * **punpckhqdq** || unpack and interleave high quadwords ||> _m128i || [|_mm_unpackhi_epi64] || (_m128i A, _m128i a) ||
 * **pshuflw** || packed shuffle low words ||> _m128i || [|_mm_shufflelo_epi16] || (_m128i a, int imm) ||
 * **pshufhw** || packed shuffle high words ||> _m128i || [|_mm_shufflehi_epi16] || (_m128i a, int imm) ||
 * **pshufd** || packed shuffle doublewords ||> _m128i || [|_mm_shuffle_epi32] || (_m128i a, int imm) ||
 * ~ load, store, moves ||>  ||~   ||   ||
 * **movdqa** || move aligned double quadword, xmm := *p ||> _m128i || [|_mm_load_si128] || (_m128i *p) ||
 * **movdqu** || move unaligned double quadword, xmm := *p ||> _m128i || [|_mm_loadu_si128] || (_m128i *p) ||
 * **movdqa** || move aligned double quadword, *p := xmm ||> void || [|_mm_store_si128] || (_m128i *p, _m128i a) ||
 * **movdqu** || move unaligned double quadword, *p := xmm ||> void || [|_mm_storeu_si128] || (_m128i *p, _m128i a) ||
 * **movq** || move quadword, xmm := gp64 ||> _m128i || [|_mm_cvtsi64_si128] || (_int64 a) ||
 * **movq** || move quadword, gp64 := xmm ||> _int64 || [|_mm_cvtsi128_si64] || (_m128i a) ||
 * **movd** || move double word or quadword, xmm := gp64 ||> _m128i || [|_mm_cvtsi64x_si128] || (_int64 value) ||
 * **movd** || move doubleword, xmm := gp32 ||> _m128i || [|_mm_cvtsi32_si128] || (int a) ||
 * **movd** || move doubleword, gp32 := xmm ||> int || [|_mm_cvtsi128_si32] || (_m128i a) ||
 * **pextrw** || extract packed word, gp16 := xmm[i] ||> int || [|_mm_extract_epi16] || (_m128i a, int imm) ||
 * **pinsrw** || packed insert word, xmm[i] := gp16 ||> _m128i || [|_mm_insert_epi16] || (_m128i a, int b, int imm) ||
 * **pmovmskb** || packed move mask byte, gp32 := 16 sign-bits(xmm) ||> int || [|_mm_movemask_epi8] || (_m128i a) ||
 * ~ cache support ||>  ||~   ||   ||
 * **prefetch** ||  || void || [|_mm_prefetch] || (char * p, int i ) ||