首页 > 其他分享 >kernel——文件系统

kernel——文件系统

时间:2022-11-24 15:00:25浏览次数:42  
标签:kernel 00 fs struct 文件系统 ................ ff

设备端文件系统的格式

所有文件系统都使用如下格式为基础

如minix,适用于小容量环境

如ext2,适用于大容量环境,于是进行了扩展

具体分析设备上的文件系统

以最简单的minix为例

格式化

root@ubuntu:~# mkfs.minix /dev/sdb
704 inodes
2048 blocks
Firstdatazone=26 (26)
Zonesize=1024
Maxsize=268966912

说明:一共创建了 2048 个逻辑块,其中第 0-25 号逻辑块用于存放引导块、super_block、i-bmap、d-bmap 和 inode[704];一个逻辑块大小为 1KB,单个文件最大为 268966912B。

dump出磁盘的内容

root@ubuntu:~# hexdump -C /dev/sdb > wlt/minix.origin
root@ubuntu:~# cat wlt/minix.origin
00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000400  c0 02 00 08 01 00 01 00  1a 00 00 00 00 1c 08 10  |................|
00000410  8f 13 01 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000420  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000800  03 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000810  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000850  00 00 00 00 00 00 00 00  fe ff ff ff ff ff ff ff  |................|
00000860  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
*
00000c00  03 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000c10  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000cf0  00 00 00 00 00 00 00 00  00 00 00 00 80 ff ff ff  |................|
00000d00  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
*
00001000  ed 41 00 00 40 00 00 00  30 c5 75 63 00 02 1a 00  |.A..@...0.uc....|
00001010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00006800  01 00 2e 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006810  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006820  01 00 2e 2e 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006830  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006840  00 00 2e 62 61 64 62 6c  6f 63 6b 73 00 00 00 00  |...badblocks....|
00006850  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00200000

文件系统必须提供 xxx_inode , xxx_super_block, xxx_dir_entry,如minix,通过下面的信息,就能解读 以minix格式化的磁盘数据。
include/uapi/linux/minix_fs.h

  /*
   * This is the original minix inode layout on disk.
   * Note the 8-bit gid and atime and ctime.
   */
  struct minix_inode {
      __u16 i_mode;     // file type and access permissions
      __u16 i_uid;      // user id
      __u32 i_size;     // size
      __u32 i_time;     // timestamp (the only time field in this layout)
      __u8  i_gid;      // group id
      __u8  i_nlinks;   // number of hard links
      __u16 i_zone[9];  // data zones that belong to this inode
  };


  /*
   * minix super-block data on disk
   */
  struct minix_super_block {
      __u16 s_ninodes;           // number of inodes
      __u16 s_nzones;            // number of logical blocks
      __u16 s_imap_blocks;       // logical blocks occupied by the inode bitmap
      __u16 s_zmap_blocks;       // logical blocks occupied by the data-zone bitmap
      __u16 s_firstdatazone;     // logical block number of the first data zone
      __u16 s_log_zone_size;     // size of one data zone, expressed as a power of two (2^n)
      __u32 s_max_size;          // largest supported file size
      __u16 s_magic;             // magic number identifying the on-disk format
      __u16 s_state;             // NOTE(review): presumably filesystem state flags — confirm
      __u32 s_zones;             // NOTE(review): presumably a wider zone count — confirm
  };

  /* On-disk directory entry: an inode number followed by the entry name. */
  struct minix_dir_entry {
      __u16 inode;    // inode number this name refers to
      char name[0];   // name bytes; zero-length array, so the storage follows the struct
  };

创建目录和文件后,文件系统的改变

root@ubuntu:~# mount /dev/sdb  /mnt/
root@ubuntu:~# cd /mnt/
root@ubuntu:/mnt# mkdir dir0
root@ubuntu:/mnt# echo "world" > file0
root@ubuntu:/mnt# cd dir0/
root@ubuntu:/mnt/dir0# echo "hello" > file1
root@ubuntu:/mnt/dir0# cd /root/
root@ubuntu:~# umount /mnt
root@ubuntu:~# hexdump -C /dev/sdb > wlt/minix.data

文件系统

root@ubuntu:~# cat wlt/minix.data
00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000400  c0 02 00 08 01 00 01 00  1a 00 00 00 00 1c 08 10  |................|
00000410  8f 13 01 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000420  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000800  1f 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000810  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000850  00 00 00 00 00 00 00 00  fe ff ff ff ff ff ff ff  |................|
00000860  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
*
00000c00  1f 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00000c10  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00000cf0  00 00 00 00 00 00 00 00  00 00 00 00 80 ff ff ff  |................|
00000d00  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................|
*
00001000  ed 41 00 00 80 00 00 00  f3 df 75 63 00 03 1a 00  |.A........uc....|
00001010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00001020  ed 41 00 00 60 00 00 00  f9 df 75 63 00 02 1b 00  |.A..`.....uc....|
00001030  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00001040  a4 81 00 00 06 00 00 00  08 e0 75 63 00 01 1d 00  |..........uc....|
00001050  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00001060  a4 81 00 00 06 00 00 00  ff df 75 63 00 01 1c 00  |..........uc....|
00001070  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00006800  01 00 2e 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006810  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006820  01 00 2e 2e 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006830  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006840  02 00 64 69 72 30 00 00  00 00 00 00 00 00 00 00  |..dir0..........|
00006850  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006860  03 00 66 69 6c 65 30 00  00 00 00 00 00 00 00 00  |..file0.........|
00006870  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00006c00  02 00 2e 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006c10  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006c20  01 00 2e 2e 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006c30  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
00006c40  04 00 66 69 6c 65 31 00  00 00 00 00 00 00 00 00  |..file1.........|
00006c50  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00007000  68 65 6c 6c 6f 0a 00 00  00 00 00 00 00 00 00 00  |hello...........|
00007010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00007400  77 6f 72 6c 64 0a 00 00  00 00 00 00 00 00 00 00  |world...........|
00007410  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
*
00200000

分析如下

VFS


Linux中一切皆文件,关键就在VFS:它让应用程序可以使用同一套文件操作接口访问kernel。
为了实现这样的目标,具体的文件系统需要实现各自的回调方法,VFS会迟绑定这些回调。
如 file->fops
比如 minix 的

  /* File-operations table for minix; every slot points at a generic_file_* helper. */
  const struct file_operations minix_file_operations = {
      .llseek     = generic_file_llseek,
      .read_iter  = generic_file_read_iter,
      .write_iter = generic_file_write_iter,
      .mmap       = generic_file_mmap,
      .fsync      = generic_file_fsync,
      .splice_read    = generic_file_splice_read,
  };

而 字符设备 的

  /* Default file-operations table for character devices: only open and llseek are set. */
  const struct file_operations def_chr_fops = {
      .open = chrdev_open,
      .llseek = noop_llseek,
  };

需要迟绑定的核心数据结构包括
VFS super_block
VFS inode
VFS file
VFS dentry
此外VFS还有一些重要的类型
file_system_type
vfsmount

文件系统的注册

文件系统模块init时,进行注册,exit时,删除。
以minix为例

  /* Filesystem type descriptor that the minix module registers with the VFS. */
  static struct file_system_type minix_fs_type = {
      .owner      = THIS_MODULE,
      .name       = "minix",
      .mount      = minix_mount,
      .kill_sb    = kill_block_super,
      .fs_flags   = FS_REQUIRES_DEV,
  };
  MODULE_ALIAS_FS("minix");

  /*
   * Module init: set up the inode cache, then register the minix
   * filesystem type. If registration fails the inode cache is torn
   * down again. Returns 0 on success or the error code of the step
   * that failed.
   */
  static int __init init_minix_fs(void)
  {
      int ret;

      ret = init_inodecache();
      if (ret)
          return ret;

      ret = register_filesystem(&minix_fs_type);
      if (ret) {
          /* undo the only step that succeeded */
          destroy_inodecache();
          return ret;
      }

      return 0;
  }

  /* Module exit: unregister the minix filesystem type, then destroy the inode cache. */
  static void __exit exit_minix_fs(void)
  {
          unregister_filesystem(&minix_fs_type);
      destroy_inodecache();
  }

  /* Declare the functions above as the module's init and exit entry points. */
  module_init(init_minix_fs)
  module_exit(exit_minix_fs)

file_system_type 表示文件系统

  /* Describes one filesystem type known to the VFS. */
  struct file_system_type {
      const char *name;   // name of the filesystem
      int fs_flags;       // FS_* flag bits defined below
  #define FS_REQUIRES_DEV     1    /* needs a backing device, e.g. a disk */
  #define FS_BINARY_MOUNTDATA 2    /* NOTE(review): original note says "network filesystems"; the name suggests binary mount data — confirm */
  #define FS_HAS_SUBTYPE      4
  #define FS_USERNS_MOUNT     8   /* Can be mounted by userns root */
  #define FS_DISALLOW_NOTIFY_PERM 16  /* Disable fanotify permission events */
  #define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
  #define FS_RENAME_DOES_D_MOVE   32768   /* FS will handle d_move() during rename() internally. */
      int (*init_fs_context)(struct fs_context *);
      const struct fs_parameter_spec *parameters;
      struct dentry *(*mount) (struct file_system_type *, int,
                 const char *, void *);         // method that mounts the filesystem
      void (*kill_sb) (struct super_block *);   // unmount method
      struct module *owner;
      struct file_system_type * next;           // registered filesystems are chained into a linked list through this pointer
      struct hlist_head fs_supers;

      struct lock_class_key s_lock_key;
      struct lock_class_key s_umount_key;
      struct lock_class_key s_vfs_rename_key;
      struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

      struct lock_class_key i_lock_key;
      struct lock_class_key i_mutex_key;
      struct lock_class_key invalidate_lock_key;
      struct lock_class_key i_mutex_dir_key;
  };

具体看注册操作

  /*
   * Walk the file_systems list looking for an entry whose name is exactly
   * the first `len` bytes of `name`.
   *
   * Returns the address of the link that points at the matching entry, or,
   * when nothing matches, the address of the terminating NULL link.
   */
  static struct file_system_type **find_filesystem(const char *name, unsigned len)
  {
      struct file_system_type **slot = &file_systems;

      while (*slot) {
          const char *cur = (*slot)->name;

          /* prefix must match and the entry's name must end right there */
          if (!strncmp(cur, name, len) && cur[len] == '\0')
              return slot;
          slot = &(*slot)->next;
      }
      return slot;
  }

  /*
   * Add `fs` to the list of registered filesystem types.
   *
   * Returns 0 on success, -EINVAL when the parameter description fails
   * validation, and -EBUSY when `fs` is already linked (fs->next set) or a
   * filesystem with the same name is already present.
   */
  int register_filesystem(struct file_system_type * fs)
  {
      int res = 0;
      struct file_system_type ** p;

      if (fs->parameters &&
          !fs_validate_description(fs->name, fs->parameters))
          return -EINVAL;

      /* a '.' in the filesystem name is treated as a bug */
      BUG_ON(strchr(fs->name, '.'));
      if (fs->next)
          return -EBUSY;
      write_lock(&file_systems_lock);
      p = find_filesystem(fs->name, strlen(fs->name));   // look the name up: yields the matching link if registered, otherwise the tail's NULL link
      if (*p)
          res = -EBUSY;
      else
          *p = fs;    /* append by storing into the tail link */
      write_unlock(&file_systems_lock);
      return res;
  }

所以注册后系统会生成如下链表

可以查看系统已注册的文件系统

root@ubuntu:~/wlt/build/linux-5.16.2# cat /proc/filesystems
nodev   sysfs
nodev   rootfs
nodev   ramfs
nodev   bdev
nodev   proc
nodev   cpuset

设备挂载

为了理解,需要了解的类型

  /* In-memory (VFS) super block; excerpt, remaining fields elided. */
  struct super_block {
      struct list_head    s_list;     /* all mounted filesystems are linked into one list */
      dev_t           s_dev;      /* search index; _not_ kdev_t */
      unsigned char       s_blocksize_bits;
      unsigned long       s_blocksize;
      loff_t          s_maxbytes; /* Max file size */
      struct file_system_type *s_type;      // points to the filesystem type (module) that was mounted
      const struct super_operations   *s_op;  // callback table
      const struct dquot_operations   *dq_op;
      const struct quotactl_ops   *s_qcop;
      const struct export_operations *s_export_op;
      unsigned long       s_flags;
      unsigned long       s_iflags;   /* internal SB_I_* flags */
      unsigned long       s_magic;
      struct dentry       *s_root;      // root directory of the filesystem
      struct rw_semaphore s_umount;
      int         s_count;
      ...
  };

  /* In-memory (VFS) directory entry; excerpt, remaining fields elided. */
  struct dentry {
      ...
      struct hlist_bl_node d_hash;    /* lookup hash list */
      struct dentry *d_parent;    /* parent directory */
      struct qstr d_name;         /* file name */
      struct inode *d_inode;      /* Where the name belongs to - NULL is
                       * negative */
      unsigned char d_iname[DNAME_INLINE_LEN];    /* small names */

      const struct dentry_operations *d_op;
      struct super_block *d_sb;   /* The root of the dentry tree */
      ...
  } __randomize_layout;

  /* In-memory (VFS) inode; excerpt, remaining fields elided. */
  struct inode {
      umode_t         i_mode;
      unsigned short      i_opflags;
      kuid_t          i_uid;
      kgid_t          i_gid;
      unsigned int        i_flags;

      const struct inode_operations   *i_op;  // callback table
      struct super_block  *i_sb;
      struct address_space    *i_mapping;
      ....
  };

可见以上类型是 VFS 对 文件系统的抽象,kernel在挂载设备时,通过读取设备上的文件系统,以创建以上类型的对象。而且这些对象互相连接。

首先创建 mount vfsmount dentry inode super_block 的关系

上面执行完后,会构成如下结构


综上,通过 mount_hashtable 就可以找到 设备的super_block,再通过super_block就能加载设备根目录的inode,从而实现访问挂载的设备。
mount_hashtable的键值是根据 struct path 计算出的

  /* A (mount, dentry) pair; both members are needed to identify a location. */
  struct path {
      struct vfsmount *mnt;   // mount, which carries the mounted super block
      struct dentry *dentry;  // directory of the mount point
  } __randomize_layout;

因为在确定 dentry 对应的 inode 时,只通过 dentry是无法确定的,因为dentry上可能挂载了设备,所以还需要 mnt

如果挂载了子设备则会形成如下结构

文件打开和创建

相关对象关系

读文件

设计介绍

注意 address_space,因为 数据块是不连续存储,而对上层读写需要数据的连续,所以定义 地址空间,用于实现不连续的数据连续读写,address_space使用 radix tree 管理page

代码分析

设备文件

设备文件在 /dev目录下,包括 字符设备,块设备,

yangxr@vexpress:/ # ls /dev/ -l
total 0
crw-rw----    1 0        0           5,   1 Jan  1 00:00 console
crw-rw----    1 0        0          10, 127 Jan  1 00:00 cpu_dma_latency
crw-rw----    1 0        0           1,   7 Jan  1 00:00 full
crw-rw----    1 0        0          10, 183 Jan  1 00:00 hwrng
drwxr-xr-x    2 0        0               80 Jan  1 00:00 input
crw-rw----    1 0        0           1,  11 Jan  1 00:00 kmsg
crw-rw----    1 0        0           1,   1 Jan  1 00:00 mem
brw-rw----    1 0        0         179,   0 Jan  1 00:00 mmcblk0
crw-rw----    1 0        0          90,   0 Jan  1 00:00 mtd0

再看/dev目录上挂载的文件系统,是 tmpfs,

yangxr@vexpress:/ # mount
192.168.5.129:/root/wlt/rootfs on / type nfs (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=192.168.5.129,mountvers=1,mountproto=tcp,local_lock=all,addr=192.168.5.129)
proc on /proc type proc (rw,relatime)
tmpfs on /tmp type tmpfs (rw,relatime)
sysfs on /sys type sysfs (rw,relatime)
var on /dev type tmpfs (rw,relatime)
devpts on /dev/pts type devpts (rw,relatime,mode=600,ptmxmode=000)

而 tmpfs是基于内存的

yangxr@vexpress:/ # cat /proc/filesystems
nodev   sysfs
nodev   tmpfs
nodev   bdev
nodev   proc
nodev   cgroup
nodev   cgroup2
nodev   cpuset
nodev   devtmpfs
nodev   tracefs
nodev   sockfs
nodev   pipefs
nodev   ramfs
nodev   rpc_pipefs
nodev   devpts
        ext3
        ext4
        ext2
        cramfs
        squashfs
        vfat
nodev   nfs
nodev   jffs2
nodev   9p
nodev   ubifs

所以设备文件和普通的不同:

  • 设备文件的inode不记录到 存储设备上,都是kernel运行后基于内存创建的
  • 设备文件最关键的信息是设备号

设备文件的创建时机:

  • kernel初始化时预先创建
  • udev/mdev 根据 /sys/class 创建
  • 用户调用mknod

tmpfs文件系统

  // tmpfs does not implement the .mount operation; .mount is what reads a device's super_block and builds the in-memory super_block and mount objects
  static struct file_system_type shmem_fs_type = {
      .owner      = THIS_MODULE,
      .name       = "tmpfs",
      .init_fs_context = shmem_init_fs_context,
  #ifdef CONFIG_TMPFS
      .parameters = shmem_fs_parameters,
  #endif
      .kill_sb    = kill_litter_super,
      .fs_flags   = FS_USERNS_MOUNT,
  };

  /*
   * Boot-time setup for tmpfs: create the inode cache, register the
   * filesystem type and create a kernel-internal mount stored in shm_mnt.
   *
   * Returns 0 on success. On failure the completed steps are undone in
   * reverse order, shm_mnt is set to an error pointer and the error code is
   * returned.
   */
  int __init shmem_init(void)
  {
      int error;

      shmem_init_inodecache();

      error = register_filesystem(&shmem_fs_type);
      if (error) {
          pr_err("Could not register tmpfs\n");
          goto out2;
      }

      shm_mnt = kern_mount(&shmem_fs_type);
      if (IS_ERR(shm_mnt)) {
          error = PTR_ERR(shm_mnt);
          pr_err("Could not kern_mount tmpfs\n");
          goto out1;
      }

  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
      /* copy the huge-page policy into the mounted super block, or fall back to "never" */
      if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
          SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
      else
          shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
  #endif
      return 0;

  out1:
      /* mount failed: undo the registration */
      unregister_filesystem(&shmem_fs_type);
  out2:
      /* registration failed (or fell through from out1): drop the inode cache */
      shmem_destroy_inodecache();
      shm_mnt = ERR_PTR(error);
      return error;
  }

设备文件的创建

设备文件的创建使用mknod,主要工作是创建inode,记录设备号,绑定默认ops

设备文件的打开

根文件系统

根文件系统是 kernel 切换到用户进程必须的文件系统,有特定的目录结构

root@ubuntu:~/wlt/build/linux-5.16.2# ls ../../rootfs
bin  dev  etc  lib  linuxrc  proc  root  sbin  sys  tmp  usr  var

以及必需的应用程序,如 linuxrc -> bin/busybox

和一些初始化系统的配置如

root@ubuntu:~/wlt/build/linux-5.16.2# ls ../../rootfs/etc/
fstab  init.d  inittab  profile

fstab 用于指导初始化时挂载 的文件系统
inittab:系统启动,关闭等特殊动作时运行的脚本或程序
init.d:目录,下面是各种脚本
profile : 界面格式

根文件系统的挂载
三种方式:
1)在设备上创建,需要给引导传参,以指定根文件系统的设备和文件系统类型
2)initrd,将根文件系统相关数据构建成模块,会进行两次挂载。
3)initramfs,链接内核时,会创建一个特殊的段,用于存储根文件系统的数据,最终成为内核二进制的一部分

rootfs

根文件系统的挂载需要系统中已经存在一个目录,而这个目录又属于一个文件系统,这个文件系统比根文件系统还早被创建,且不依赖于任何其他文件系统,被称为 rootfs。
rootfs是基于 tmpfs实现,在kernel初始化阶段创建。

  /* Kernel entry point (excerpt): only the filesystem-related calls are shown. */
  asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
  {

      ...
      vfs_caches_init();   // creates rootfs and its root directory
      ...

      arch_call_rest_init();  // mounts the root filesystem

  }

代码分析

rootfs 挂载后

rootfs挂载后会创建如下结构,再将进程的工作目录和当前目录的path设置为rootfs的根目录。

挂载根文件系统

我使用的启动参数为

 setenv bootargs 'root=/dev/nfs rw noinitrd  \
      nfsroot=192.168.5.129:/root/wlt/rootfs \
      ip=192.168.5.127 \
      init=/linuxrc console=ttyAMA0';

根据启动参数挂载文件系统到rootfs的root目录,再修改挂载点到rootfs的/目录,最后设置 task_struct->fs,也就是进程的根目录为 nfs的 根目录

initrd

CPIO格式

修改内核,使其支持压缩格式

创建initrd镜像

# Run from inside the rootfs directory. The archive is written to the parent
# directory: the shell creates the output file before `find .` starts, so
# writing it into the current directory would pack a partial copy of the
# archive into itself.
find . | cpio -o -H newc | gzip -c > ../initrd.tgz

uboot启动参数

  /*
   * U-Boot default boot command: fetch the kernel image, the device tree
   * and initrd.tgz over TFTP, set bootargs (initrd location and size, root
   * device, rdinit program, console), then boot with bootm.
   */
  #define CONFIG_BOOTCOMMAND \
      "tftp 0x60010000 uImage; tftp 0x60500000 vexpress-v2p-ca9.dtb; \
      tftp 0x62000000 initrd.tgz; \
      setenv bootargs 'initrd=0x62000000,4M root=/dev/ram0 rw \
      rdinit=/linuxrc console=ttyAMA0'; \
      bootm 0x60010000 - 0x60500000;"

启动后,查看mount情况,证明根文件系统为rootfs

yangxr@vexpress:/ # mount
rootfs on / type rootfs (rw)
proc on /proc type proc (rw,relatime)
tmpfs on /tmp type tmpfs (rw,relatime)
sysfs on /sys type sysfs (rw,relatime)
var on /dev type tmpfs (rw,relatime)
devpts on /dev/pts type devpts (rw,relatime,mode=600,ptmxmode=000)

initramfs

kernel配置 initramfs的目录

kbuild会将目录打包为cpio格式,链接到Image中。
需要注意 这会导致 uImage 很大,通常需要修改 uboot的对 image的最大限制。
使用 initramfs 时,默认的 init 程序为 /init;如果根文件系统中的 init 程序不是这个名字,需要修改程序名称,或者用启动参数 rdinit= 指定

代码分析

使用CPIO格式时,kernel会解压并展开内存中的数据,到rootfs文件系统,以rootfs文件系统为根文件系统,并运行 /init 程序

文件系统自动挂载

运行init程序后,init会根据/etc/fstab自动挂载文件系统

proc           /proc      proc    defaults   0     0
tmpfs          /tmp       tmpfs   defaults   0     0
sysfs          /sys       sysfs   defaults   0     0
var            /dev       tmpfs   defaults   0     0
ramfs          /dev       tmpfs   defaults   0     0

tmpfs 和 ramfs 都是基于内存的文件系统,差别是:tmpfs 是 ramfs 的改进版,当文件删除后 tmpfs 会释放内存,当内存不够时,tmpfs 会把文件页交换到交换空间(swap,例如 flash 上的交换分区)

标签:kernel,00,fs,struct,文件系统,................,ff
From: https://www.cnblogs.com/yangxinrui/p/16899698.html

相关文章