Table of Contents
1. Address overlap
2. Improving efficiency with SSE instructions
Implementing memcpy is a favourite interview question at big companies, mainly because it hides two classic catch points: address overlap and copy efficiency. The interface in the standard C library on Linux does not solve both of these problems, and memory copying is one of the most frequently used library functions, which is why interviewers like the question so much.
The Linux man page states that memcpy does not support overlapping addresses; if the addresses do overlap, memmove must be used instead. Let's first look at a source implementation of memcpy:
/**
 * memcpy - Copy one area of memory to another
 * @dest: Where to copy to
 * @src: Where to copy from
 * @count: The size of the area.
 *
 * You should not use this function to access IO space, use memcpy_toio()
 * or memcpy_fromio() instead.
 */
void *memcpy(void *dest, const void *src, size_t count)
{
    char *tmp = dest;
    const char *s = src;

    while (count--)
        *tmp++ = *s++;
    return dest;
}
As you can see, memcpy simply copies the bytes one by one from front to back. When the source and destination regions do not overlap this is fine; when they do overlap, memcpy can go wrong. First of all, note that overlap comes in two forms:
1. Forward overlap, i.e. src > dest && dest + size > src (the destination region starts before the source region and the two ranges overlap).
In this case memcpy still copies correctly, but after the copy the contents pointed to by src will inevitably have been modified.
2. Backward overlap, i.e. dest > src && src + size > dest (the destination region starts inside the source region).
In this case memcpy cannot copy correctly, because the tail of the source region is overwritten by the newly copied data before it has been read, so by the end of the copy the result is no longer the original data. When the addresses overlap backward, memmove must be used instead. memmove simply checks whether a backward overlap exists: if it does, it copies from back to front; otherwise it copies from front to back.
/**
 * memmove - Copy one area of memory to another
 * @dest: Where to copy to
 * @src: Where to copy from
 * @count: The size of the area.
 *
 * Unlike memcpy(), memmove() copes with overlapping areas.
 */
void *memmove(void *dest, const void *src, size_t count)
{
    char *tmp;
    const char *s;

    if (dest <= src) {
        tmp = dest;
        s = src;
        while (count--)
            *tmp++ = *s++;
    } else {
        tmp = dest;
        tmp += count;
        s = src;
        s += count;
        while (count--)
            *--tmp = *--s;
    }
    return dest;
}
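To make the overlap problem concrete, here is a minimal sketch. Calling the library memcpy with overlapping buffers is undefined behaviour, so the byte-wise forward copy from above is reproduced here under the made-up name naive_memcpy; the buffer contents are arbitrary test data.

#include <stdio.h>
#include <string.h>

/* Byte-wise forward copy, equivalent to the kernel memcpy shown above. */
static void *naive_memcpy(void *dest, const void *src, size_t count)
{
    char *tmp = dest;
    const char *s = src;
    while (count--)
        *tmp++ = *s++;
    return dest;
}

int main(void)
{
    /* Backward overlap: dest > src && src + size > dest. */
    char a[16] = "abcdefghij";
    naive_memcpy(a + 2, a, 8);       /* tail of the source is overwritten before it is read */
    printf("naive:   %s\n", a + 2);  /* prints "abababab" instead of "abcdefgh" */

    char b[16] = "abcdefghij";
    memmove(b + 2, b, 8);            /* copies from back to front, so the result is correct */
    printf("memmove: %s\n", b + 2);  /* prints "abcdefgh" */
    return 0;
}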
So handling address overlap only requires checking how the two regions overlap and then deciding whether to copy from front to back or from back to front. The code is as follows:
void *memcpy_func(void *dest, const void *src, size_t n)
{
    if (dest == NULL || src == NULL || n == 0) {
        return NULL;
    }

    char *p_dst = (char *)dest;
    const char *p_src = (const char *)src;

    if ((p_src + n > p_dst) && (p_dst > p_src)) {
        /* Backward overlap: copy from the end towards the beginning. */
        p_dst += n;
        p_src += n;
        while (n-- > 0) {
            *--p_dst = *--p_src;
        }
    } else {
        /* No overlap, or forward overlap: copy from front to back. */
        while (n-- > 0) {
            *p_dst++ = *p_src++;
        }
    }
    return dest;
}
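A quick sanity check of memcpy_func under both overlap directions might look like the sketch below (it assumes memcpy_func above is in the same file; the test strings and lengths are arbitrary):

#include <stdio.h>

int main(void)
{
    /* Backward overlap: dest (buf + 2) lies inside [src, src + n). */
    char buf[16] = "abcdefghij";
    memcpy_func(buf + 2, buf, 8);
    printf("%.8s\n", buf + 2);   /* expected: abcdefgh */

    /* Forward overlap: src (buf2 + 2) is ahead of dest, copied front to back. */
    char buf2[16] = "abcdefghij";
    memcpy_func(buf2, buf2 + 2, 8);
    printf("%.8s\n", buf2);      /* expected: cdefghij */

    return 0;
}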
Copying a single byte at a time is obviously naive. When n is small this hardly matters, but when the number of bytes to copy reaches the megabyte range, moving a large chunk of data per step is clearly the better choice. By embedding SSE instructions in C via inline assembly, we can copy 16, 32, 48, 64, 128 or 256 bytes at a time. The code is as follows:
/**
 * Copy 16 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov16(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "memory");
}

/**
 * Copy 32 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov32(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "memory");
}

/**
 * Copy 48 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov48(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "memory");
}

/**
 * Copy 64 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov64(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu 48(%[src]), %%xmm3\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])\n\t"
                  "movdqu %%xmm3, 48(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "xmm3", "memory");
}

/**
 * Copy 128 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov128(uint8_t *dst, const uint8_t *src)
{
    asm volatile ("movdqu (%[src]), %%xmm0\n\t"
                  "movdqu 16(%[src]), %%xmm1\n\t"
                  "movdqu 32(%[src]), %%xmm2\n\t"
                  "movdqu 48(%[src]), %%xmm3\n\t"
                  "movdqu 64(%[src]), %%xmm4\n\t"
                  "movdqu 80(%[src]), %%xmm5\n\t"
                  "movdqu 96(%[src]), %%xmm6\n\t"
                  "movdqu 112(%[src]), %%xmm7\n\t"
                  "movdqu %%xmm0, (%[dst])\n\t"
                  "movdqu %%xmm1, 16(%[dst])\n\t"
                  "movdqu %%xmm2, 32(%[dst])\n\t"
                  "movdqu %%xmm3, 48(%[dst])\n\t"
                  "movdqu %%xmm4, 64(%[dst])\n\t"
                  "movdqu %%xmm5, 80(%[dst])\n\t"
                  "movdqu %%xmm6, 96(%[dst])\n\t"
                  "movdqu %%xmm7, 112(%[dst])"
                  :
                  : [src] "r" (src),
                    [dst] "r" (dst)
                  : "xmm0", "xmm1", "xmm2", "xmm3",
                    "xmm4", "xmm5", "xmm6", "xmm7", "memory");
}

/**
 * Copy 256 bytes from one location to another using optimised SSE
 * instructions. The locations should not overlap.
 *
 * @param dst
 *   Pointer to the destination of the data.
 * @param src
 *   Pointer to the source data.
 */
static inline void
mov256(uint8_t *dst, const uint8_t *src)
{
    /*
     * There are 16 XMM registers, but this function does not use
     * them all so that it can still be compiled as 32bit
     * code. The performance increase was negligible if all 16
     * registers were used.
     */
    mov128(dst, src);
    mov128(dst + 128, src + 128);
}

/**
 * Copy bytes from one location to another. The locations should not overlap.
 *
 * @param s1
 *   Pointer to the destination of the data.
 * @param s2
 *   Pointer to the source data.
 * @param n
 *   Number of bytes to copy.
 * @return
 *   s1
 */
void *fast_memcpy(void *s1, const void *s2, size_t n)
{
    uint8_t *dst = (uint8_t *)s1;
    const uint8_t *src = (const uint8_t *)s2;

    /* We can't copy < 16 bytes using XMM registers so do it manually. */
    if (n < 16) {
        if (n & 0x01) {
            *dst = *src;
            dst += 1;
            src += 1;
        }
        if (n & 0x02) {
            *(uint16_t *)dst = *(const uint16_t *)src;
            dst += 2;
            src += 2;
        }
        if (n & 0x04) {
            *(uint32_t *)dst = *(const uint32_t *)src;
            dst += 4;
            src += 4;
        }
        if (n & 0x08) {
            *(uint64_t *)dst = *(const uint64_t *)src;
        }
        return s1;
    }

    /* Special fast cases for <= 128 bytes */
    if (n <= 32) {
        mov16(dst, src);
        mov16(dst - 16 + n, src - 16 + n);
        return s1;
    }
    if (n <= 64) {
        mov32(dst, src);
        mov32(dst - 32 + n, src - 32 + n);
        return s1;
    }
    if (n <= 128) {
        mov64(dst, src);
        mov64(dst - 64 + n, src - 64 + n);
        return s1;
    }

    /*
     * For large copies > 128 bytes. This combination of 256, 64 and 16 byte
     * copies was found to be faster than doing 128 and 32 byte copies as
     * well.
     */
    for (; n >= 256; n -= 256, dst += 256, src += 256) {
        mov256(dst, src);
    }

    /*
     * We split the remaining bytes (which will be less than 256) into
     * 64byte (2^6) chunks.
     * Using incrementing integers in the case labels of a switch statement
     * encourages the compiler to use a jump table. To get incrementing
     * integers, we shift the 2 relevant bits to the LSB position to first
     * get decrementing integers, and then subtract.
     */
    switch (3 - (n >> 6)) {
    case 0x00:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    case 0x01:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    case 0x02:
        mov64(dst, src);
        n -= 64;
        dst += 64;
        src += 64; /* fallthrough */
    default:
        ;
    }

    /*
     * We split the remaining bytes (which will be less than 64) into
     * 16byte (2^4) chunks, using the same switch structure as above.
     */
    switch (3 - (n >> 4)) {
    case 0x00:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    case 0x01:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    case 0x02:
        mov16(dst, src);
        n -= 16;
        dst += 16;
        src += 16; /* fallthrough */
    default:
        ;
    }

    /* Copy any remaining bytes, without going beyond end of buffers */
    if (n != 0) {
        mov16(dst - 16 + n, src - 16 + n);
    }
    return s1;
}
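For reference, a minimal and deliberately rough way to exercise fast_memcpy against the library memcpy could look like the sketch below. It assumes an x86 target with SSE2 and GCC/Clang inline-assembly syntax; the 1 MiB buffer size, the repeat count and the clock()-based timing are arbitrary choices, not part of the original code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main(void)
{
    size_t size = 1 << 20;              /* 1 MiB, arbitrary */
    uint8_t *src = malloc(size);
    uint8_t *dst = malloc(size);
    if (!src || !dst)
        return 1;
    memset(src, 0xAB, size);

    clock_t start = clock();
    for (int i = 0; i < 1000; i++)
        fast_memcpy(dst, src, size);
    double secs = (double)(clock() - start) / CLOCKS_PER_SEC;

    /* Verify correctness against the library memcmp, then report the time. */
    printf("match=%d, time=%.3fs\n", memcmp(dst, src, size) == 0, secs);

    free(src);
    free(dst);
    return 0;
}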
Note that for relatively small copies this interface is not particularly efficient, and this implementation still does not handle backward address overlap.