// 项目 / 一般 / 简介
// 内存拷贝 » test.cpp
// ly w, 2024-12-16 09:20

#include <iostream>
#include <chrono>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstring>
#include <cerrno>
#include <getopt.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <time.h>
#include <stdbool.h>
#include <assert.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/poll.h>
#include <unistd.h>
#include <stdbool.h>
#include <sys/eventfd.h>

#include <sched.h>
#include <pthread.h>

#include <stdint.h>
#include <memory.h>
#include <sys/time.h>

#include <arm_neon.h>

/*
 * Device-node paths of the DMA-heap allocators this file knows about.
 * The names suggest cached/uncached and 32-bit-addressable variants, but
 * nothing in this file verifies which nodes exist on a given target.
 */
#define DMA_HEAP_UNCACHE_PATH "/dev/dma_heap/system-uncached"
#define DMA_HEAP_PATH "/dev/dma_heap/system"
#define DMA_HEAP_DMA32_UNCACHED_PATH "/dev/dma_heap/system-uncached-dma32"
#define DMA_HEAP_DMA32_PATH "/dev/dma_heap/system-dma32"
#define CMA_HEAP_UNCACHED_PATH "/dev/dma_heap/cma-uncached"
#define RV1106_CMA_HEAP_PATH "/dev/rk_dma_heap/rk-dma-heap-cma"
/* Heap node opened by the copy tests (test_mem_copy / test_mem_copy1). */
#define DMA_TYPE DMA_HEAP_DMA32_UNCACHED_PATH

/* Issues DMA_BUF_IOCTL_SYNC with START | RW on `fd`; returns the ioctl result. */
int dma_sync_device_to_cpu(int fd);
/* Issues DMA_BUF_IOCTL_SYNC with END | RW on `fd`; returns the ioctl result. */
int dma_sync_cpu_to_device(int fd);

/*
 * Allocates `size` bytes from the heap device at `path` and maps them.
 * On success returns 0 and stores the buffer fd in `*fd` and the mapped
 * address in `*va`; on failure returns a negative value.
 */
int dma_buf_alloc(const char *path, size_t size, int *fd, void **va);
/* Unmaps `size` bytes at `va`, closes `*fd` and resets `*fd` to -1. */
void dma_buf_free(size_t size, int *fd, void *va);

/*
 * Fixed-width aliases used by the ioctl argument structures below.
 * They are declared before their first use so this section does not depend
 * on a system header having introduced the names already.
 * NOTE(review): identifiers starting with a double underscore are reserved;
 * if <linux/types.h> is pulled in, these must match its definitions.
 */
typedef unsigned long long __u64;
typedef unsigned int __u32;

/*
 * Flag bits for DMA_BUF_IOCTL_SYNC: an access direction (READ/WRITE/RW)
 * combined with either START or END.
 * NOTE(review): these appear to mirror the kernel's <linux/dma-buf.h> —
 * confirm against the target kernel headers.
 */
#define DMA_BUF_SYNC_READ (1 << 0)
#define DMA_BUF_SYNC_WRITE (2 << 0)
#define DMA_BUF_SYNC_RW (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE)
#define DMA_BUF_SYNC_START (0 << 2)
#define DMA_BUF_SYNC_END (1 << 2)

/* Argument of DMA_BUF_IOCTL_SYNC. */
struct dma_buf_sync {
    __u64 flags; /* OR of the DMA_BUF_SYNC_* bits above */
};

#define DMA_BUF_BASE 'b'
#define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)

/* Parenthesised so the macro is safe inside larger expressions (e.g. `x / CMA_HEAP_SIZE`). */
#define CMA_HEAP_SIZE (1024 * 1024)

/*
 * Argument of DMA_HEAP_IOCTL_ALLOC.  Callers in this file fill `len` and
 * `fd_flags` and read the new buffer's descriptor back from `fd`.
 */
struct dma_heap_allocation_data {
    __u64 len;        /* requested allocation size in bytes */
    __u32 fd;         /* descriptor of the allocated buffer, read after the ioctl */
    __u32 fd_flags;   /* open flags for that descriptor (O_CLOEXEC | O_RDWR here) */
    __u64 heap_flags; /* left at zero by every caller in this file */
};

#define DMA_HEAP_IOC_MAGIC 'H'
#define DMA_HEAP_IOCTL_ALLOC _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,\
                                   struct dma_heap_allocation_data)

int dma_sync_device_to_cpu(int fd) {
struct dma_buf_sync sync = {0};

sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync);
}

int dma_sync_cpu_to_device(int fd) {
struct dma_buf_sync sync = {0};

sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
return ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync);
}

/*
 * Allocates a buffer of `size` bytes from the DMA heap device at `path` and
 * maps it into this process.
 *
 * path : heap device node to open (e.g. one of the *_PATH macros).
 * size : number of bytes to allocate and map.
 * fd   : out — receives the descriptor of the allocated buffer.
 * va   : out — receives the address of the shared mapping.
 *
 * Returns 0 on success.  On failure returns a negative value (-EINVAL for
 * null arguments, the open()/ioctl() result, or -errno from mmap()); in that
 * case `*fd` and `*va` are left untouched and no descriptor stays open.
 */
int dma_buf_alloc(const char *path, size_t size, int *fd, void **va) {
    if (path == NULL || fd == NULL || va == NULL) {
        return -EINVAL;
    }

    /* open dma_heap fd */
    const int dma_heap_fd = open(path, O_RDWR);
    if (dma_heap_fd < 0) {
        printf("open %s fail!\n", path);
        return dma_heap_fd;
    }

    /* alloc buffer */
    struct dma_heap_allocation_data buf_data;
    memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
    buf_data.len = size;
    buf_data.fd_flags = O_CLOEXEC | O_RDWR;
    const int ret = ioctl(dma_heap_fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);

    /* The heap device is only needed for the allocation request itself. */
    close(dma_heap_fd);

    if (ret < 0) {
        printf("RK_DMA_HEAP_ALLOC_BUFFER failed\n");
        return ret;
    }

    const int buf_fd = static_cast<int>(buf_data.fd);

    /* Map read/write only when the buffer descriptor itself allows it. */
    const int prot = (fcntl(buf_fd, F_GETFL) & O_RDWR) ? (PROT_READ | PROT_WRITE)
                                                       : PROT_READ;

    /* mmap contiguous buffer to user */
    void *mmap_va = mmap(NULL, buf_data.len, prot, MAP_SHARED, buf_fd, 0);
    if (mmap_va == MAP_FAILED) {
        /* Save errno first: printf()/close() may overwrite it. */
        const int err = errno;
        printf("mmap failed: %s\n", strerror(err));
        close(buf_fd);
        return -err;
    }

    *va = mmap_va;
    *fd = buf_fd;

    return 0;
}

/*
 * Releases a buffer obtained from dma_buf_alloc().
 *
 * size : length of the mapping in bytes (passed to munmap unchanged, so
 *        sizes above INT_MAX are no longer truncated).
 * fd   : in/out — descriptor to close; set to -1 afterwards.  May be null.
 * va   : start of the mapping; null or MAP_FAILED is skipped.
 */
void dma_buf_free(size_t size, int *fd, void *va) {
    if (va != NULL && va != MAP_FAILED && size != 0) {
        munmap(va, size);
    }

    if (fd == NULL) {
        return;
    }
    if (*fd >= 0) {
        close(*fd);
    }
    *fd = -1;
}

/* Byte count used for every buffer in the copy tests; parenthesised so it
 * expands safely inside larger expressions such as `x % SIZE`. */
#define SIZE (13 * 1024 * 1024) // 13MB

/*
 * Copies `n` bytes from `src` to `dest` in 16-byte chunks, then copies the
 * remaining (n % 16) bytes one at a time.  The regions must not overlap.
 *
 * No alignment pre-pass is performed: vld1q_u8 / vst1q_u8 operate on byte
 * elements and are used here on arbitrarily aligned pointers.  Dropping the
 * pre-pass also guarantees that the 16-byte loop always starts at offset 0
 * and therefore never touches memory past `n`.
 *
 * When the translation unit is built without NEON support, the bulk part
 * falls back to std::memcpy so the function stays usable (and testable) on
 * other targets.
 */
void neon_memcpy(void* dest, const void* src, size_t n) {
    const uint8_t* in = static_cast<const uint8_t*>(src);
    uint8_t* out = static_cast<uint8_t*>(dest);

    /* Largest multiple of 16 that fits in n. */
    const size_t bulk = n - (n % 16);

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    for (size_t i = 0; i < bulk; i += 16) {
        const uint8x16_t chunk = vld1q_u8(in + i);
        vst1q_u8(out + i, chunk);
    }
#else
    if (bulk != 0) {
        std::memcpy(out, in, bulk);
    }
#endif

    /* Tail: fewer than 16 bytes remain. */
    for (size_t i = bulk; i < n; ++i) {
        out[i] = in[i];
    }
}

/*
 * Calls `copy_once` `rounds` times back to back and returns the average
 * wall-clock time of a single call, in milliseconds.
 */
template <typename CopyFn>
static double average_copy_ms(int rounds, CopyFn copy_once)
{
    const auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < rounds; ++i) {
        copy_once();
    }
    const auto finish = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> elapsed = finish - begin;
    return elapsed.count() * 1.0 / rounds;
}

/*
 * Benchmarks three ways of copying SIZE bytes from a DMA-heap buffer
 * (allocated from DMA_TYPE) into a file-backed mapping of
 * /dev/shm/test_shared_memory:
 *   copy1 - a single std::memcpy,
 *   copy2 - std::memcpy through a temporary heap buffer,
 *   copy3 - neon_memcpy.
 * Each variant runs 10 times and the average per copy is printed.
 *
 * Returns 0 on success.  On failure prints a diagnostic and returns -1, or
 * the ioctl() result if the heap allocation itself failed.  Every descriptor
 * and mapping acquired so far is released on all paths.
 *
 * Figures recorded earlier in this file for count = 10 (NEON was not faster
 * than memcpy in that run):
 *   copy1: 10.2568 ms, copy2: 10.5191 ms, copy3: 86.7563 ms
 */
int test_mem_copy()
{
    /* Closes the wrapped descriptor on scope exit; negative values are ignored. */
    struct ScopedFd {
        int fd;
        explicit ScopedFd(int raw) : fd(raw) {}
        ~ScopedFd() { if (fd >= 0) close(fd); }
        ScopedFd(const ScopedFd&) = delete;
        ScopedFd& operator=(const ScopedFd&) = delete;
    };
    /* Unmaps the wrapped region on scope exit; MAP_FAILED is ignored. */
    struct ScopedMap {
        void* addr;
        size_t len;
        ScopedMap(void* a, size_t l) : addr(a), len(l) {}
        ~ScopedMap() { if (addr != MAP_FAILED) munmap(addr, len); }
        ScopedMap(const ScopedMap&) = delete;
        ScopedMap& operator=(const ScopedMap&) = delete;
    };

    const size_t length = SIZE;

    ScopedFd heap(open(DMA_TYPE, O_RDWR));
    if (heap.fd < 0) {
        const int err = errno; /* read before any stream I/O can change it */
        std::cerr << "文件描述符无效, 错误代码: " << err << std::endl;
        return -1;
    }

    /* alloc buffer */
    struct dma_heap_allocation_data buf_data;
    memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
    buf_data.len = length;
    buf_data.fd_flags = O_CLOEXEC | O_RDWR;
    const int ret = ioctl(heap.fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);
    if (ret < 0) {
        std::cerr << "RK_DMA_HEAP_ALLOC_BUFFER failed" << std::endl;
        return ret;
    }
    ScopedFd buffer(static_cast<int>(buf_data.fd));

    /* Map read/write only when the buffer descriptor itself allows it. */
    const int prot = (fcntl(buffer.fd, F_GETFL) & O_RDWR) ? (PROT_READ | PROT_WRITE)
                                                          : PROT_READ;

    ScopedMap dma(mmap(nullptr, length, prot, MAP_SHARED, buffer.fd, 0), length);
    if (dma.addr == MAP_FAILED) {
        const int err = errno;
        std::cerr << "DMA内存映射失败, 错误代码: " << err << std::endl;
        return -1;
    }

    ScopedFd shm(open("/dev/shm/test_shared_memory", O_RDWR | O_CREAT, 0666));
    if (shm.fd < 0) {
        std::cerr << "无法打开 /dev/shm/test_shared_memory" << std::endl;
        return -1;
    }

    /* The file must be at least `length` bytes long, otherwise touching the
     * mapping beyond its end raises SIGBUS. */
    if (ftruncate(shm.fd, static_cast<off_t>(length)) != 0) {
        const int err = errno;
        std::cerr << "ftruncate 失败, 错误代码: " << err << std::endl;
        return -1;
    }

    ScopedMap shared(mmap(nullptr, length, PROT_READ | PROT_WRITE, MAP_SHARED, shm.fd, 0),
                     length);
    if (shared.addr == MAP_FAILED) {
        const int err = errno;
        std::cerr << "SHM内存映射失败, 错误代码: " << err << std::endl;
        return -1;
    }

    void* const dma_memory = dma.addr;
    void* const shm_memory = shared.addr;
    const int count = 10;

    const double direct_ms = average_copy_ms(count, [&] {
        std::memcpy(shm_memory, dma_memory, length);
    });
    std::cout << "copy1[" << count << "]" << "拷贝使用的时间1: " << direct_ms << " 毫秒" << std::endl;

    /* The staging buffer is allocated and freed inside the timed region, so
     * its cost is included in the reported average. */
    const double staged_ms = average_copy_ms(count, [&] {
        char* staging = new char[length];
        std::memcpy(staging, dma_memory, length);
        std::memcpy(shm_memory, staging, length);
        delete[] staging;
    });
    std::cout << "copy2[" << count << "]" << "拷贝使用的时间2: " << staged_ms << " 毫秒" << std::endl;

    const double neon_ms = average_copy_ms(count, [&] {
        neon_memcpy(shm_memory, dma_memory, length);
    });
    std::cout << "copy3[" << count << "]" << "拷贝使用的时间3: " << neon_ms << " 毫秒" << std::endl;

    return 0;
}

#define COPY_DIRECT 1
#define COPY_CACHE 2
int test_mem_copy1(int flag)
{
int count = 100;
auto start = std::chrono::high_resolution_clock::now();
for(int i = 0; i < count; i++)
{
int dma_fd = open(DMA_TYPE, O_RDWR);
if (dma_fd < 0) {
std::cerr << "文件描述符无效, 错误代码: " << errno << std::endl;
close(dma_fd);
return -1;
}

struct dma_heap_allocation_data buf_data;
/* alloc buffer */
memset(&buf_data, 0x0, sizeof(struct dma_heap_allocation_data));
buf_data.len = SIZE;
buf_data.fd_flags = O_CLOEXEC | O_RDWR;
int ret = ioctl(dma_fd, DMA_HEAP_IOCTL_ALLOC, &buf_data);
if (ret < 0) {
std::cerr << "RK_DMA_HEAP_ALLOC_BUFFER failed" << std::endl;
return ret;
}

/* mmap va */
int prot;
if (fcntl(buf_data.fd, F_GETFL) & O_RDWR)
prot = PROT_READ | PROT_WRITE;
else
prot = PROT_READ;

// std::cout << "DMA FD: " << dma_fd << ", SIZE: " << SIZE << std::endl;
void* dma_memory = mmap(nullptr, SIZE, prot, MAP_SHARED, buf_data.fd, 0);
if (dma_memory == MAP_FAILED)
{
std::cerr << "DMA内存映射失败, 错误代码: " << errno << std::endl;
close(dma_fd);
return -1;
}

int shm_fd = open("/dev/shm/test_shared_memory", O_RDWR | O_CREAT, 0666);
if (shm_fd < 0)
{
std::cerr << "无法打开 /dev/shm/test_shared_memory" << std::endl;
munmap(dma_memory, SIZE);
close(dma_fd);
return -1;
}

// std::cout << "SHM FD: " << shm_fd << ", SIZE: " << SIZE << std::endl;
ftruncate(shm_fd, SIZE);
void* shm_memory = mmap(nullptr, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
if (shm_memory == MAP_FAILED)
{
std::cerr << "SHM内存映射失败, 错误代码: " << errno << std::endl;
close(shm_fd);
munmap(dma_memory, SIZE);
close(dma_fd);
return -1;
}

if(flag == COPY_DIRECT)
{
std::memcpy(shm_memory, dma_memory, SIZE);
}
else
{
char *ch = new char[SIZE];
std::memcpy(ch, dma_memory, SIZE);
std::memcpy(shm_memory, ch, SIZE);
delete[] ch;
}

munmap(shm_memory, SIZE);
munmap(dma_memory, SIZE);
close(shm_fd);
close(dma_fd);
close(buf_data.fd);
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << (flag == COPY_DIRECT ? "copy1" : "copy2") << "[" << count << "]" << "拷贝使用的时间: " << duration.count()*1.0/count << " 毫秒" << std::endl;

return 0;
}

/*
 * Entry point: runs the copy benchmark and reports its outcome through the
 * process exit status (0 on success, 1 if the benchmark returned an error).
 */
int main() {
    // Alternative setup via dma_buf_alloc(), kept for reference:
    // int dma_fd;
    // void* dma_memory;
    // int r = dma_buf_alloc(DMA_HEAP_DMA32_UNCACHED_PATH, SIZE, &dma_fd, (void **)&dma_memory);
    // if (r < 0)
    // {
    // printf("alloc data %s failed!\n", DMA_HEAP_DMA32_UNCACHED_PATH);
    // return -1;
    // }
    // else
    // {
    // printf("alloc data %s success!\n", DMA_HEAP_DMA32_UNCACHED_PATH);
    // }

    const int status = test_mem_copy();
    // test_mem_copy1(COPY_DIRECT);
    // test_mem_copy1(COPY_CACHE);

    // test_mem_copy() returns negative codes; map any failure to exit status 1.
    return status == 0 ? 0 : 1;
}
// (1-1/1)