#ifndef __SSE3__
#include <byteswap.h>  /* bswap_32() for the non-SSE3 endian swap below */
#endif

/* Compare xmm0 (str1) shifted left by n bytes against xmm1 (str2), shift the
 * byte-equality mask back, keep only the bits where at least three
 * consecutive bytes matched, and OR them into the accumulator (%0). */
#define LBLOCK(n) \
    "movapd %%xmm0, %%xmm2\n\t" \
    "pslldq $" #n ", %%xmm2\n\t" \
    "pcmpeqb %%xmm1, %%xmm2\n\t" \
    "psrldq $" #n ", %%xmm2\n\t" \
    "pmovmskb %%xmm2, %%ecx\n\t" \
    "mov %%ecx, %%edx\n\t" \
    "shl $1, %%edx\n\t" \
    "and %%edx, %%ecx\n\t" \
    "shl $1, %%edx\n\t" \
    "and %%edx, %%ecx\n\t" \
    "or %%ecx, %0\n\t"

/* Same as LBLOCK, but with str2 (xmm1) shifted against str1 (xmm0). */
#define RBLOCK(n) \
    "movapd %%xmm1, %%xmm2\n\t" \
    "pslldq $" #n ", %%xmm2\n\t" \
    "pcmpeqb %%xmm0, %%xmm2\n\t" \
    /* necessary to avoid a glitch */ \
    "psrldq $" #n ", %%xmm2\n\t" \
    "pmovmskb %%xmm2, %%ecx\n\t" \
    "mov %%ecx, %%edx\n\t" \
    "shl $1, %%edx\n\t" \
    "and %%edx, %%ecx\n\t" \
    "shl $1, %%edx\n\t" \
    "and %%edx, %%ecx\n\t" \
    "or %%ecx, %0\n\t"

#ifdef USE_LCS5
int lcs_32_rough(const uint32_t * const str1, const uint32_t * const str2)
{
    register unsigned int x;

#ifndef __SSE3__
    static uint32_t buf1[4] __attribute__((aligned(16)));
    static uint32_t buf2[4] __attribute__((aligned(16)));
    size_t i;

    /* byte-reverse the 16-byte inputs by hand */
    for (i = 0; i < 4; i++) {
        buf1[i ^ 3] = bswap_32(str1[i]);
        buf2[i ^ 3] = bswap_32(str2[i]);
    }
#else
    /* this is an endian swap, for use with pshufb
       (pshufb's memory operand must be 16-byte aligned) */
    static uint8_t mask[16] __attribute__((aligned(16))) = {
        0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
        0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
    const uint32_t * const buf1 = str1;
    const uint32_t * const buf2 = str2;
#endif

    /* The pointers are used in the asm below, but there aren't enough
     * registers in 32-bit mode to make this work as a constraint on the
     * __asm__ itself.  This does the equivalent to prevent reordering,
     * while leaving two extra GPRs free. */
    /* volatile int foo1 = *str1;
     * volatile int foo2 = *str2; */
    __asm__ volatile("" : : "m"(*str1), "m"(*str2));

    __asm__ volatile (
        /* move the unmodified inputs into place */
        "movapd (%1), %%xmm0\n\t"
        "movapd (%2), %%xmm1\n\t"
#ifdef __SSE3__
        /* endian swap */
        "pshufb (%3), %%xmm0\n\t"
        "pshufb (%3), %%xmm1\n\t"
#endif
        /* clear the accumulator */
        "xor %0, %0\n\t"

        LBLOCK(0)  LBLOCK(1)  LBLOCK(2)  LBLOCK(3)
        LBLOCK(4)  LBLOCK(5)  LBLOCK(6)  LBLOCK(7)
        LBLOCK(8)  LBLOCK(9)  LBLOCK(10) LBLOCK(11)
        LBLOCK(12)
        /* no RBLOCK(0) because it's equivalent to LBLOCK(0) */
        RBLOCK(1)  RBLOCK(2)  RBLOCK(3)  RBLOCK(4)
        RBLOCK(5)  RBLOCK(6)  RBLOCK(7)  RBLOCK(8)
        RBLOCK(9)  RBLOCK(10) RBLOCK(11) RBLOCK(12)

        /* This is the shortest method I could come up with to shift xmm0
         * left by 4 bits: shift both 64-bit halves, then carry the upper
         * 4 bits of the low half up into the high half. */
        "movapd %%xmm0, %%xmm2\n\t"
        "psllq $4, %%xmm0\n\t"
        "psrlq $60, %%xmm2\n\t"
        /* Shuffle xmm2 around (in dwords) -- after the shift right by 60,
         * dwords 01 and 11 are zero, so we can use them for the three we
         * don't care about -- the constant 69 = 0b01000101 selects
         * [z] [0] [z] [z], where z = 01 or 11. */
        "pshufd $69, %%xmm2, %%xmm2\n\t"
        /* Most operations work here, since the same bit position can only
         * be 1 in one of the registers: add, or and xor are all fine. */
        "pxor %%xmm2, %%xmm0\n\t"

        LBLOCK(0)  LBLOCK(1)  LBLOCK(2)  LBLOCK(3)
        LBLOCK(4)  LBLOCK(5)  LBLOCK(6)  LBLOCK(7)
        LBLOCK(8)  LBLOCK(9)  LBLOCK(10) LBLOCK(11)
        LBLOCK(12)
        RBLOCK(1)  RBLOCK(2)  RBLOCK(3)  RBLOCK(4)
        RBLOCK(5)  RBLOCK(6)  RBLOCK(7)  RBLOCK(8)
        RBLOCK(9)  RBLOCK(10) RBLOCK(11) RBLOCK(12)
        "1:"
        : "=r"(x)
        : "r"(buf1), "r"(buf2)
#ifdef __SSE3__
          , "r"(mask)
#endif
        : "xmm0", "xmm1", "xmm2", "xmm3", "edx", "ecx");

    /* printf("ret %u\n", x); */
    if (unlikely(x != 0))
        return 8;

    return 0;
}
#endif /* USE_LCS5 */
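
/*
 * Illustrative sketch, not part of the original code: the same 4-bit left
 * shift across the full 128-bit register that the inline asm performs with
 * psllq/psrlq/pshufd, written with SSE2 intrinsics.  The guard macro
 * LCS5_SHIFT_EXAMPLE and the function name lcs5_sll128_by4 are made-up
 * names used purely for illustration.
 */
#ifdef LCS5_SHIFT_EXAMPLE
#include <emmintrin.h>

static __m128i lcs5_sll128_by4(__m128i v)
{
    __m128i hi    = _mm_slli_epi64(v, 4);   /* shift each 64-bit half left by 4 */
    __m128i carry = _mm_srli_epi64(v, 60);  /* top 4 bits of each half, now in bits 3:0 */

    /* Move the low half's carry up to dword 2 (bit 64); the other selectors
     * pick dwords that the 60-bit right shift already zeroed (0x45 == 69,
     * the same pshufd immediate as in the asm above). */
    carry = _mm_shuffle_epi32(carry, 0x45);

    /* OR is equivalent to the pxor in the asm, since no bit is set in both */
    return _mm_or_si128(hi, carry);
}
#endif /* LCS5_SHIFT_EXAMPLE */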