pmdk -- libpmemlog 介绍

标签：plp -- libpmemlog next pmemlog pmdk report buf size

文章目录

1. libpmemlog 应用背景
2. libpmemlog 使用方式

2.1 基本接口
2.2 接口使用

3. Libpmemlog 性能

3.1 write sys call 性能
3.2 libpmemlog 性能

1. libpmemlog 应用背景

本文介绍的是英特尔傲腾持久化内存 pmdk中的一个持久化日志的库。

我们正常系统中会将日志形成一个LOG文件保存到磁盘中，这个过程在PMEM中也是类似的。尤其是基于PMEM构建自己的高性能应用时，使用传统的文件系统写日志的接口会降低PMEM本身的日志写入速度，从而间接影响系统性能，这方面后文会有两者的性能对比测试。

所以基于PMEM的系统开发，日志也建议使用PMEM的日志库来完成日志信息的持久化。

2. libpmemlog 使用方式

社区提供了对linux和windows系统的不同使用方式，这里也会有一些基本的区别。

2.1 基本接口

pmemlog_create 在PMEM上创建一个pmempool，用于存储数据。

Pmempool 是pmdk操作PMEM空间的一种形态，需要先从PMEM上划分一定的存储空间pmempool，然后通过类似文件的接口写入/删除数据。

PMEMlogpool *pmemlog_create(const char *path, size_t poolsize, mode_t mode);

参数包括 pmempool 的存储路径、pool 大小以及访问权限。

该接口底层还是会使用pmemobj 来进行创建：

/*
 * pmemlog_create -- create a log pool on top of a pmemobj pool
 *
 * Passes path, poolsize and mode straight to pmemobj_create() together
 * with the LAYOUT_NAME layout, and hands the result back as a
 * PMEMlogpool pointer.
 */
PMEMlogpool *
pmemlog_create(const char *path, size_t poolsize, mode_t mode)
{
  PMEMlogpool *plp =
      (PMEMlogpool *)pmemobj_create(path, LAYOUT_NAME, poolsize, mode);

  return plp;
}

pmemlog_open 打开pmempool，打开成功并返回pool文件句柄。

PMEMlogpool *pmemlog_open(const char *path);

只需要指定打开的pool文件路径即可

底层仍然会使用pmemobj相关接口来打开pool，能够保证打开过程的一致性和原子性。

/*
 * pmemlog_open -- open an existing log pool
 *
 * Opens the pmemobj pool at path with the LAYOUT_NAME layout and hands
 * the result back as a PMEMlogpool pointer.
 */
PMEMlogpool *
pmemlog_open(const char *path)
{
  PMEMlogpool *plp = (PMEMlogpool *)pmemobj_open(path, LAYOUT_NAME);

  return plp;
}

pmemlog_append 向打开的pmempool中写入数据，追加方式，类似write

int pmemlog_append(PMEMlogpool *plp, const void *buf, size_t count);

参数包括要写入的 pmempool 对象 plp、要写入的数据 buf，以及写入大小 count。

当然写入的过程仍然是通过pmemobj相关的事务接口保证写入的原子性：

/*
 * pmemlog_append -- append count bytes from buf as a new entry at the log tail
 *
 * The new entry is allocated, filled and linked behind the current tail
 * inside a single pmemobj transaction that also takes the log's rwlock
 * (TX_PARAM_RWLOCK), and the root object's bytes_written counter is
 * increased by count.
 *
 * Returns 0 on success and -1 on failure, whether the transaction could
 * not be started or was aborted part-way. Both failure paths report a
 * negative value so that callers checking "< 0" see every failure.
 */
int
pmemlog_append(PMEMlogpool *plp, const void *buf, size_t count)
{
  PMEMobjpool *pop = (PMEMobjpool *)plp;
  PMEMoid baseoid = pmemobj_root(pop, sizeof(struct base));
  struct base *bp = pmemobj_direct(baseoid);

  /* set the return point */
  jmp_buf env;
  if (setjmp(env)) {
    /* end the transaction */
    (void) pmemobj_tx_end();
    /* report the abort with the same negative value as the path below */
    return -1;
  }

  /* begin a transaction, also acquiring the write lock for the log */
  if (pmemobj_tx_begin(pop, env, TX_PARAM_RWLOCK, &bp->rwlock,
      TX_PARAM_NONE))
    return -1;

  /* allocate the new node to be inserted */
  PMEMoid log = pmemobj_tx_alloc(count + sizeof(struct log_hdr),
        LOG_TYPE);

  /* fill in the new entry; it becomes the last one, so it has no successor */
  struct log *logp = pmemobj_direct(log);
  logp->hdr.size = count;
  memcpy(logp->data, buf, count);
  logp->hdr.next = OID_NULL;

  /* add the modified root object to the undo log */
  pmemobj_tx_add_range(baseoid, 0, sizeof(struct base));
  if (bp->tail.off == 0) {
    /* no tail yet: the new entry also becomes the head */
    bp->head = log;
  } else {
    /* add the modified tail entry to the undo log */
    pmemobj_tx_add_range(bp->tail, 0, sizeof(struct log));
    ((struct log *)pmemobj_direct(bp->tail))->hdr.next = log;
  }

  bp->tail = log; /* update tail */
  bp->bytes_written += count;

  pmemobj_tx_commit();
  (void) pmemobj_tx_end();
  return 0;
}

pmemlog_walk 从pmempool中遍历写入的数据

void pmemlog_walk(PMEMlogpool *plp, size_t chunksize,
  int (*process_chunk)(const void *buf, size_t len, void *arg),
  void *arg);

其中 plp 是 pmemlogpool 的对象，通过 process_chunk 回调函数来依次访问 plp 中从起始到结束的所有数据，chunksize 表示访问粒度（不过在下面这份基于 pmemobj 的实现中并没有用到这个参数）。

需要注意的是，pmemlog_walk 函数为了保证访问的原子性，会在处理过程中加读锁，这个时候不能在 process_chunk 中再次调用 pmemlog_append 写入数据，否则可能会出现死锁。

/*
 * pmemlog_walk -- hand every log entry, from head to tail, to a callback
 *
 * plp is the log pool, process_chunk is called once per entry with the
 * entry's data, its size and the caller's arg. chunksize is accepted but
 * not used here, and the value returned by process_chunk is ignored.
 * The log's rwlock is held for reading during the whole walk; if it
 * cannot be taken the function returns without visiting anything.
 */
void
pmemlog_walk(PMEMlogpool *plp, size_t chunksize,
  int (*process_chunk)(const void *buf, size_t len, void *arg), void *arg)
{
  PMEMobjpool *pop = (PMEMobjpool *)plp;

  /* the root object anchors the list of entries (head/tail) and the lock */
  struct base *bp = pmemobj_direct(pmemobj_root(pop, sizeof(struct base)));

  /* take the read lock; give up silently when that fails */
  if (pmemobj_rwlock_rdlock(pop, &bp->rwlock) != 0)
    return;

  /*
   * Entries were appended at the tail, so following hdr.next from the
   * head visits them in the order they were written.
   */
  for (struct log *entry = pmemobj_direct(bp->head); entry != NULL;
      entry = pmemobj_direct(entry->hdr.next))
    process_chunk(entry->data, entry->hdr.size, arg);

  /* done reading, drop the read lock */
  pmemobj_rwlock_unlock(pop, &bp->rwlock);
}

pmemlog_rewind 清理pmemlogpool中的所有数据，事务方式清理
pmemlog_close 关闭pmempool 的对象

2.2 接口使用

如下代码，功能是使用pmemlog相关接口写入持久化数据，并打印出来。

#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <libpmemlog.h>

/* size of the pmemlog pool -- 1 GB */
#define POOL_SIZE ((size_t)(1 << 30))

/*
 * printit -- pmemlog_walk() callback that copies one chunk to stdout
 *
 * Writes the len bytes at buf to standard output. arg is not used.
 * Always returns 0.
 */
int
printit(const void *buf, size_t len, void *arg)
{
  (void) arg; /* this callback carries no state */

  fwrite(buf, 1, len, stdout);
  return 0;
}

/*
 * main -- append two strings to a pmemlog pool and print the whole log
 *
 * Creates the pool at ./pmem (or opens it when creation fails, e.g.
 * because it already exists), reports the value of pmemlog_nbyte(),
 * appends two strings and walks the log with printit().
 *
 * Returns 0 on success; exits with status 1 when the pool cannot be
 * opened or an append fails.
 */
int
main(int argc, char *argv[])
{
  const char path[] = "./pmem";
  PMEMlogpool *plp;
  size_t nbyte;
  /* points at string literals, so it must be a pointer to const */
  const char *str;

  (void) argc;
  (void) argv;

  /* create the pmemlog pool or open it if it already exists */
  plp = pmemlog_create(path, POOL_SIZE, 0666);
  if (plp == NULL)
    plp = pmemlog_open(path);

  if (plp == NULL) {
    perror(path);
    exit(1);
  }

  /* how many bytes does the log hold? */
  nbyte = pmemlog_nbyte(plp);
  printf("log holds %zu bytes\n", nbyte);

  /* append to the log... */
  str = "This is the first string appended\n";
  if (pmemlog_append(plp, str, strlen(str)) < 0) {
    /* report first: closing the pool may overwrite errno */
    perror("pmemlog_append");
    pmemlog_close(plp);
    exit(1);
  }
  str = "This is the second string appended\n";
  if (pmemlog_append(plp, str, strlen(str)) < 0) {
    perror("pmemlog_append");
    pmemlog_close(plp);
    exit(1);
  }

  /* print the log contents */
  printf("log contains:\n");
  pmemlog_walk(plp, 0, printit, NULL);

  pmemlog_close(plp);
  return 0;
}

编译:
g++ libpmemlog_walk.cc -o log_walk -lpmem -lpmemlog

输出如下：

$ ./log_walk
log holds 1073733632 bytes
log contains:
This is the first string appended
This is the second string appended

可以看到在当前PMEM 挂载的文件系统中有一个pmem pool的data 类型的文件，可以通过pmempool工具查看写入的内容：

$ ls -l pmem
-rw-rw-rw- 1 server server 1073741824 Jan 29 11:50 pmem
$ file pmem
pmem: data
$ pmempool dump pmem
00000000  54 68 69 73 20 69 73 20  74 68 65 20 66 69 72 73  |This is the firs|
00000010  74 20 73 74 72 69 6e 67  20 61 70 70 65 6e 64 65  |t string appende|
00000020  64 0a 54 68 69 73 20 69  73 20 74 68 65 20 73 65  |d.This is the se|
00000030  63 6f 6e 64 20 73 74 72  69 6e 67 20 61 70 70 65  |cond string appe|
00000040  6e 64 65 64 0a                                    |nded.           |

3. Libpmemlog 性能

文章开头提到过，当我们使用 PMEM 硬件构建存储系统时，日志记录方式的选择会在一定程度上影响性能。使用传统的 vfs 系统调用写日志文件与使用 pmdk 提供的 libpmemlog，两者之间的性能差异有多大，需要在使用前进行验证，以便指导我们完成系统日志方案的选型评估。

3.1 write sys call 性能

如下测试代码，通过write 写入20*64M 大小的数据。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include <sys/time.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>

#include <libpmemlog.h>

#define POOL_SIZE ((uint64_t)(64 << 20))

using namespace std;

const uint64_t file_op = 20;
const uint64_t write_size=(4*1024*1024);
const uint64_t read_size=(1024*1024);
const uint64_t read_op=1000;

/*
 * get_now_micros -- current gettimeofday() time in microseconds
 *
 * The seconds field is widened to uint64_t before it is multiplied, so
 * the product cannot overflow on platforms where time_t is narrower
 * than 64 bits.
 */
static uint64_t get_now_micros(){
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t)tv.tv_sec * 1000000 + (uint64_t)tv.tv_usec;
}

/*
 * write_all -- write exactly len bytes from data to fd
 *
 * Calls write(2) repeatedly because one call may write fewer bytes than
 * requested. Returns true once everything has been written, false when
 * write(2) reports an error or makes no progress.
 */
static bool write_all(int fd, const void *data, size_t len) {
  const char *pos = (const char *)data;

  while (len > 0) {
    // keep the result signed: -1 is how write(2) reports an error
    ssize_t written = write(fd, pos, len);
    if (written <= 0) {
      return false;
    }
    pos += written;
    len -= (size_t)written;
  }
  return true;
}

/*
 * main -- measure write(2) + fsync(2) throughput through the file system
 *
 * Usage: <program> <file-prefix>
 *
 * Creates file_op files named "<file-prefix>_vfs_NNN.log", appends
 * POOL_SIZE bytes to each in chunks of write_size bytes, syncs and closes
 * it, and finally prints the average time per write and the throughput.
 *
 * Returns 0 on success and 1 on a bad command line or any failed
 * allocation, open, write, fsync or close.
 */
int main(int argc, char *argv[]) {

  char *filename;
  // the single argument is the prefix used to name the output files
  if (argc == 2) {
    filename = argv[1];
    printf("filename : %s \n", filename);
  } else {
    printf("args is not valid\n");
    return 1;
  }

  uint64_t start_time,end_time,do_time,next_report_;
  char namebuf[100];
  int fd;
  uint64_t i,j;
  bool error = false;

  // payload for every write: write_size bytes of '1'
  void *buf = malloc(write_size);
  if (buf == NULL) {
    perror("malloc");
    return 1;
  }
  memset(buf, '1', write_size);

  // start of the measured interval
  start_time=get_now_micros();
  next_report_=0;

  // one file per iteration, file_op files in total
  for (i = 0; i < file_op; i++) {
    snprintf(namebuf, sizeof(namebuf), "%s_vfs_%03d.log",filename, (int)i);
    // O_CREAT requires the mode argument; without it the permissions are garbage
    fd = open(namebuf, O_CREAT|O_RDWR|O_APPEND, 0644);
    if (fd == -1) {
      perror(namebuf);
      free(buf);
      return 1;
    }

    // fill the file in chunks of write_size bytes
    for (j = 0; j < (POOL_SIZE / write_size); j++ ) {
      if (!write_all(fd, buf, write_size)) {
        fprintf(stderr, "%s: write failed\n", namebuf);
        error = true;
        break;
      }
    }

    // make sure the data reached the device before the clock is stopped
    if (!error && fsync(fd) != 0) {
      perror("fsync");
      error = true;
    }

    // always release the descriptor, also after a failure
    if (close(fd) != 0 && !error) {
      perror("close");
      error = true;
    }

    if (error) {
      break;
    }

    if (i >= next_report_) {
      if      (next_report_ < 1000)   next_report_ += 100;
      else if (next_report_ < 5000)   next_report_ += 500;
      else if (next_report_ < 10000)  next_report_ += 1000;
      else if (next_report_ < 50000)  next_report_ += 5000;
      else if (next_report_ < 100000) next_report_ += 10000;
      else if (next_report_ < 500000) next_report_ += 50000;
      else                            next_report_ += 100000;

      fprintf(stderr, "... finished %d ops%30s\r", (int)i, "");
      fflush(stderr);
    }
  }

  // a failed run must not report throughput numbers
  if (error) {
    free(buf);
    return 1;
  }

  // end of the measured interval
  end_time=get_now_micros();
  do_time=end_time-start_time;
  // the casts make the arguments match the %ld conversions
  printf("file_size:%ld MB file_op:%ld write_size:%ld K\n",(long)(POOL_SIZE/1048576),(long)file_op,(long)(write_size/1024));
  printf("%11.3f micros/op %6.1f MB/s\n",(1.0*do_time)/(file_op*((POOL_SIZE/write_size))),(POOL_SIZE*file_op/1048576.0)/(do_time*1e-6));

  free(buf);

  return 0;
}

编译：g++ -std=c++11 vfs_log.cc -o vfs_log

运行性能如下：

$ ./vfs_log test
filename : test
file_size:64 MB file_op:20 write_size:4096 K
   8976.303 micros/op  445.6 MB/s

3.2 libpmemlog 性能

同样，写入20*64M的数据，变更相关文件系统接口为pmemlog接口如下:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include <sys/time.h>
#include <unistd.h>
#include <sys/types.h>
#include <fcntl.h>

#include <libpmemlog.h>

#define POOL_SIZE ((uint64_t)(64 << 20))

using namespace std;

const char filename[]="./pmem";
const uint64_t file_op = 20;
const uint64_t write_size=(4*1024*1024);
const uint64_t read_size=(1024*1024);
const uint64_t read_op=1000;

/*
 * get_now_micros -- current gettimeofday() time in microseconds
 *
 * The seconds field is widened to uint64_t before it is multiplied, so
 * the product cannot overflow on platforms where time_t is narrower
 * than 64 bits.
 */
static uint64_t get_now_micros(){
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t)tv.tv_sec * 1000000 + (uint64_t)tv.tv_usec;
}

int main(int argc, char **argv)
{

    uint64_t ret,start_time,end_time,do_time,next_report_;
    void *buf;
    void *cp_buf;
    int buf_num=sizeof(char)*write_size;
    buf=(void *)malloc(buf_num);
    cp_buf=(void *)malloc(buf_num);
    memset(buf,'1',buf_num);

    PMEMlogpool *plp;

    char namebuf[100];

    start_time=get_now_micros();
    next_report_=0;
    int i,j;
    bool error_flag=false;
    for(i=0;i<file_op;i++){
        snprintf(namebuf,sizeof(namebuf),"%sceshi%04d.pool",filename,i);
        plp = pmemlog_create(namebuf, POOL_SIZE, 0666);

        if (plp == NULL)
            plp = pmemlog_open(namebuf);

        if (plp == NULL) {
            perror(namebuf);
            return 1;
        }
        pmemlog_rewind(plp);
        for(j=0;j<(POOL_SIZE/write_size);j++){
            ret=pmemlog_append(plp, buf, write_size);
            if(ret<0){
                printf("ret:%ld append falid!\n",ret);
                error_flag = true;
                break;
            }
        }
        if(error_flag){
            break;
        }

        pmemlog_close(plp);
        if (i >= next_report_) {
        if      (next_report_ < 1000)   next_report_ += 100;
        else if (next_report_ < 5000)   next_report_ += 500;
        else if (next_report_ < 10000)  next_report_ += 1000;
        else if (next_report_ < 50000)  next_report_ += 5000;
        else if (next_report_ < 100000) next_report_ += 10000;
        else if (next_report_ < 500000) next_report_ += 50000;
        else                            next_report_ += 100000;
        fprintf(stderr, "... finished %d ops%30s\r", i, "");
        fflush(stderr);
        }
    }
    end_time=get_now_micros();
    do_time=end_time-start_time;
    printf("file_size:%ld MB file_op:%ld write_size:%ld K\n",POOL_SIZE/1048576,file_op,write_size/1024);
    printf("%11.3f micros/op %6.1f MB/s\n",(1.0*do_time)/(file_op*((POOL_SIZE/write_size))),(POOL_SIZE*file_op/1048576.0)/(do_time*1e-6));



    free(buf);
    free(cp_buf);
    return 0;

}

编译：

g++ libpmemlog_test.cc -o t2 -lpmemlog -lpmem -pthread

运行如下:

$ ./t2
file_size:64 MB file_op:20 write_size:4096 K
   4772.434 micros/op  838.1 MB/s

相比于vfs write的性能，延时上还是有很大的优势的（write需要走操作系统内核vfs接口）。

标签：plp,--,libpmemlog,next,pmemlog,pmdk,report,buf,size
From： https://blog.51cto.com/u_13456560/5823155