关于使用MMX/SSE技术优化memcpy的尝试

原创已于 2022-03-20 22:10:08 修改 · 7.5k 阅读

2 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#优化 #性能优化 #null #struct #buffer

于 2010-08-18 09:28:00 首次发布

本文探讨了尝试使用MMX/SSE技术优化memcpy()的性能，通过实验对比了不同实现方式的时间消耗，发现优化后的性能提升有限，且在执行初期可能不如未优化的memcpy。初步结论是MMX/SSE技术对memcpy的性能优化效果不显著，可能受限于汇编知识的掌握。期待更多优化建议。

近来，希望能通过使用某种技术优化常规memcpy()的性能，于是尝试了 MMX/SSE，希望能借此实现一个性能更高的memcpy函数。

代码如下（里面的USE1函数是借用别人的，但性能也不怎么样）：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>

/* Size in bytes of every benchmark buffer (100 MiB). */
#define LEN (100 * 1024 * 1024)

/* Select exactly one memcopy() implementation: USE0, USE1 or USE2. */
#define USE1

/*
 * Scope timer: records the wall-clock time when constructed and, when
 * destroyed, prints the elapsed time in milliseconds prefixed by the label.
 */
class TimeUse {
public:
    /* cMsg: label printed with the measurement; truncated to fit m_cMsg. */
    TimeUse(const char *cMsg)
    {
        /* snprintf bounds the copy and always NUL-terminates. */
        snprintf(m_cMsg, sizeof(m_cMsg), "%s", cMsg ? cMsg : "");
        gettimeofday(&tTime1, NULL);
    }

    ~TimeUse()
    {
        gettimeofday(&tTime2, NULL);
        /* Work in signed microseconds so a negative tv_usec delta is handled. */
        long long llDiffUs =
            (long long)(tTime2.tv_sec - tTime1.tv_sec) * 1000000LL +
            (long long)(tTime2.tv_usec - tTime1.tv_usec);
        printf("%s Use %ld ms\n", m_cMsg, (long)(llDiffUs / 1000));
    }

private:
    struct timeval tTime1, tTime2;
    char m_cMsg[255];
};

#ifdef USE0
/*
 * MMX copy: moves 64 bytes per iteration through mm0..mm7 using
 * non-temporal stores (movntq); the remaining (size % 64) bytes go
 * through memcpy().
 *
 * dest, src: non-overlapping buffers of at least `size` bytes.
 * size:      number of bytes to copy; values <= 0 copy nothing.
 * Returns dest.
 *
 * Author's measurement: copying 100 MB took roughly 85 ms.
 */
static inline void *memcopy(void *dest, const void *src, int size)
{
    if (size <= 0) {
        return dest;
    }

    char *to = (char *)dest;
    const char *from = (const char *)src;
    size_t len = (size_t)size;
    size_t blocks = len / 64;

    if (blocks > 0) {
        /*
         * The pointers and the counter are modified by the asm, so they are
         * read-write ("+r") operands. The local label "1:" stays unique when
         * the function is inlined more than once. sfence orders the
         * non-temporal stores; emms leaves MMX state so that later x87
         * floating-point code works.
         */
        __asm__ __volatile__(
            "1:\n\t"
            "movq     (%0), %%mm0\n\t"
            "movq    8(%0), %%mm1\n\t"
            "movq   16(%0), %%mm2\n\t"
            "movq   24(%0), %%mm3\n\t"
            "movq   32(%0), %%mm4\n\t"
            "movq   40(%0), %%mm5\n\t"
            "movq   48(%0), %%mm6\n\t"
            "movq   56(%0), %%mm7\n\t"
            "movntq %%mm0,   (%1)\n\t"
            "movntq %%mm1,  8(%1)\n\t"
            "movntq %%mm2, 16(%1)\n\t"
            "movntq %%mm3, 24(%1)\n\t"
            "movntq %%mm4, 32(%1)\n\t"
            "movntq %%mm5, 40(%1)\n\t"
            "movntq %%mm6, 48(%1)\n\t"
            "movntq %%mm7, 56(%1)\n\t"
            "add    $64, %0\n\t"
            "add    $64, %1\n\t"
            "dec    %2\n\t"
            "jnz    1b\n\t"
            "sfence\n\t"
            "emms\n\t"
            : "+r"(from), "+r"(to), "+r"(blocks)
            :
            : "memory", "cc",
              "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
    }

    /* from/to now point just past the last full 64-byte block. */
    if (len % 64) {
        memcpy(to, from, len % 64);
    }
    return dest;
}
#endif

#ifdef USE1
/*
 * SSE copy, adapted from:
 *   http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 *   http://mail-index.netbsd.org/tech-perform/2002/10/23/0004.html
 *
 * The destination is first brought to 16-byte alignment with one unaligned
 * 16-byte copy, the unaligned tail is copied up front, and the aligned
 * middle is streamed with movups loads and movntps (non-temporal) stores.
 *
 * dest, src: non-overlapping buffers of at least `size` bytes.
 * size:      number of bytes to copy; values <= 0 copy nothing.
 * Returns dest.
 *
 * Author's measurement: copying 100 MB took roughly 85 ms.
 */
#define STEP  0x20
#define ALIGN 0x10
static inline void *memcopy(void *dest, const void *src, int size)
{
    if (size <= 0) {
        return dest;
    }
    /*
     * The head and tail fix-ups below each move a full 16 bytes, so small
     * copies would run outside the buffers; hand them to memcpy().
     */
    if (size < 2 * ALIGN) {
        return memcpy(dest, src, (size_t)size);
    }

    char *to = (char *)dest;
    const char *from = (const char *)src;
    size_t n = (size_t)size;

    /* Head: copy 16 bytes unaligned, then advance until `to` is aligned. */
    size_t misalign = (size_t)((unsigned long)to & (ALIGN - 1));
    if (misalign) {
        size_t head = ALIGN - misalign;
        __asm__ __volatile__(
            "movups (%0), %%xmm0\n\t"
            "movups %%xmm0, (%1)\n\t"
            :
            : "r"(from), "r"(to)
            : "memory", "xmm0");
        n -= head;
        from += head;
        to += head;
    }

    /*
     * If the copy would have tailings, take care of them now instead of
     * later: copy the last 16 bytes, then round n down to a multiple of 16.
     */
    if (n & (ALIGN - 1)) {
        size_t off = n - ALIGN;
        __asm__ __volatile__(
            "movups (%0), %%xmm0\n\t"
            "movups %%xmm0, (%1)\n\t"
            :
            : "r"(from + off), "r"(to + off)
            : "memory", "xmm0");
        n &= ~(size_t)(ALIGN - 1);
    }

    /* Prefetch the first two cachelines now. */
    __asm__ __volatile__(
        "prefetchnta 0x00(%0)\n\t"
        "prefetchnta 0x20(%0)\n\t"
        :
        : "r"(from));

    while (n >= STEP) {
        __asm__ __volatile__(
            "movups  0x00(%0), %%xmm0\n\t"
            "movups  0x10(%0), %%xmm1\n\t"
            "movntps %%xmm0, 0x00(%1)\n\t"
            "movntps %%xmm1, 0x10(%1)\n\t"
            :
            : "r"(from), "r"(to)
            : "memory", "xmm0", "xmm1");
        from += STEP;
        /*
         * Note from the original routine: intermixing the prefetch at
         * exactly this point was found, by trial and error on a userland
         * version clocked over many runs, to be the fastest ordering.
         * Do not reorder the instructions here without re-measuring.
         */
        __asm__ __volatile__(
            "prefetchnta 0x20(%0)\n\t"
            :
            : "r"(from));
        to += STEP;
        n -= STEP;
    }

    /* n is a multiple of 16, so at most one 16-byte block can remain. */
    if (n >= ALIGN) {
        __asm__ __volatile__(
            "movups  (%0), %%xmm0\n\t"
            "movntps %%xmm0, (%1)\n\t"
            :
            : "r"(from), "r"(to)
            : "memory", "xmm0");
    }

    /* Order the non-temporal stores before any later loads/stores. */
    __asm__ __volatile__("sfence" : : : "memory");
    return dest;
}
#endif

#ifdef USE2
/*
 * MMX copy with software prefetch: each loop iteration issues one
 * prefetchnta and moves 64 bytes through mm0..mm7 with interleaved movq
 * loads and movntq stores; the remaining (size % 64) bytes go through
 * memcpy().
 *
 * dest, src: non-overlapping buffers of at least `size` bytes.
 * size:      number of bytes to copy; values <= 0 copy nothing.
 * Returns dest.
 *
 * Author's measurement: copying 100 MB took roughly 85 ms.
 */
static inline void *memcopy(void *dest, const void *src, int size)
{
    if (size <= 0) {
        return dest;
    }

    char *to = (char *)dest;
    const char *from = (const char *)src;
    size_t len = (size_t)size;
    size_t blocks = len / 64;

    /* Warm up: prefetch the first four 64-byte lines of the source. */
    __asm__ __volatile__(
        "prefetchnta    (%0)\n\t"
        "prefetchnta  64(%0)\n\t"
        "prefetchnta 128(%0)\n\t"
        "prefetchnta 192(%0)\n\t"
        :
        : "r"(from));

    for (size_t i = 0; i < blocks; i++) {
        __asm__ __volatile__(
            "prefetchnta 168(%0)\n\t"
            "movq     (%0), %%mm0\n\t"
            "movntq %%mm0,   (%1)\n\t"
            "movq    8(%0), %%mm1\n\t"
            "movntq %%mm1,  8(%1)\n\t"
            "movq   16(%0), %%mm2\n\t"
            "movntq %%mm2, 16(%1)\n\t"
            "movq   24(%0), %%mm3\n\t"
            "movntq %%mm3, 24(%1)\n\t"
            "movq   32(%0), %%mm4\n\t"
            "movntq %%mm4, 32(%1)\n\t"
            "movq   40(%0), %%mm5\n\t"
            "movntq %%mm5, 40(%1)\n\t"
            "movq   48(%0), %%mm6\n\t"
            "movntq %%mm6, 48(%1)\n\t"
            "movq   56(%0), %%mm7\n\t"
            "movntq %%mm7, 56(%1)\n\t"
            :
            : "r"(from), "r"(to)
            : "memory",
              "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
        from += 64;
        to += 64;
    }

    if (blocks > 0) {
        /*
         * sfence orders the non-temporal stores; emms leaves MMX state so
         * that later x87 floating-point code works.
         */
        __asm__ __volatile__(
            "sfence\n\t"
            "emms\n\t"
            :
            :
            : "memory");
    }

    if (len & 63) {
        memcpy(to, from, len & 63);
    }
    return dest;
}
#endif

/* Prints the command-line help. */
static void PrintUsage(void)
{
    printf("Usage: './test 0' to use memcpy; or './test 1' to use memcopy\n");
}

/*
 * Benchmark driver.
 * Usage: ./test 0 (or no argument) times memcpy(); ./test 1 times memcopy().
 * Runs 256 rounds; each round rewrites all buffers before the timed copy.
 */
int main(int argc, char **argv)
{
    int iChoice = 0;

    if (argc > 2) {
        PrintUsage();
        return 1;
    }
    if (argc == 2) {
        if (sscanf(argv[1], "%d", &iChoice) != 1) {
            PrintUsage();
            return 1;
        }
        printf("your choice is %d\n", iChoice);
    }

    char *pcSrc = new char[LEN];
    char *pcDst = new char[LEN];
    char *pcSrc1 = new char[LEN];
    char *pcDst1 = new char[LEN];
    char *pcSrc2 = new char[LEN];
    char *pcDst2 = new char[LEN];
    float *pfData = new float[LEN];

    const size_t nInts = LEN / sizeof(int);

    /* iLoop runs 1..256; it also serves as the fill byte for the sources. */
    for (int iLoop = 1; iLoop <= 256; iLoop++) {
        {
            /* Baseline: plain int-sized stores over the destination buffer. */
            int *piTemp = (int *)pcDst;
            TimeUse t("==");
            for (size_t iTemp = 0; iTemp < nInts; iTemp++) {
                *piTemp++ = 123;
            }
        }
        {
            TimeUse t("memset");
            memset(pcSrc, iLoop, LEN);
        }
        memset(pcSrc1, iLoop, LEN);
        memset(pcSrc2, iLoop, LEN);
        /* Rewrite a large unrelated array between measurements. */
        for (int iLoop2 = 0; iLoop2 < LEN; iLoop2++) {
            pfData[iLoop2] = (float)(1.0123456789 + iLoop);
        }

        if (iChoice == 0) {
            TimeUse t("memcpy");
            memcpy(pcDst, pcSrc, LEN);
        } else {
            /*
             * All three timers call the single memcopy() variant selected at
             * compile time; they differ only in the buffer pair copied. The
             * second and third calls used to be commented out, which left
             * those timers measuring an empty scope.
             */
            {
                TimeUse t("memcopy0");
                memcopy(pcDst, pcSrc, LEN);
            }
            {
                TimeUse t("memcopy1");
                memcopy(pcDst1, pcSrc1, LEN);
            }
            {
                TimeUse t("memcopy2");
                memcopy(pcDst2, pcSrc2, LEN);
            }
        }
        usleep(20000);
    }

    delete[] pfData;
    delete[] pcDst2;
    delete[] pcSrc2;
    delete[] pcDst1;
    delete[] pcSrc1;
    delete[] pcDst;
    delete[] pcSrc;
    return 0;
}

试验结果：
1. 未优化，memcpy 100M数据：
[root@localhost opt]# ./test
memcpy Use 94 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 62 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 61 ms

2. 使用MMX/SSE优化，memcpy 100M数据：
[root@localhost opt]# ./test 1
your choice is 1
memcopy0 Use 110 ms
memcopy1 Use 110 ms
memcopy2 Use 110 ms
memcopy0 Use 40 ms
memcopy1 Use 42 ms
memcopy2 Use 40 ms
memcopy0 Use 48 ms
memcopy1 Use 40 ms
memcopy2 Use 41 ms
memcopy0 Use 40 ms
memcopy1 Use 40 ms

初步结论：
使用MMX/SSE技术对memcpy的性能优化空间不太大，而且在执行初期，优化的性能甚至比不上未优化的性能。
从原理上讲，SSE会比MMX快，MMX会比常规memcpy快。可能受限于AT&T汇编掌握程度，暂时未能给出理想的优化结果。
如果谁有更好的想法，欢迎随时交流。