什么是fuse

FUSE(Filesystem in Userspace)是一个文件系统框架,与ext4这类完全运行在内核态的文件系统的区别在于,FUSE有一部分逻辑运行在用户态,这使得我们可以通过FUSE实现很多自定义的逻辑

image.png

VFS

Virtual File System, 为所有的文件系统提供了统一的接口,VFS中定义了几个重要的struct, dentry, inode, super_block

Superblock

代表了整个文件系统本身,超级块保存了文件系统设定的文件块大小,超级块操作函数等信息,文件系统的所有inode也都要链接到super_block的链表头上

Dentry

每个文件都至少有一个dentry, 这个dentry链接到上级目录的dentry,为了加快对dentry的查找,内核使用了hash表来缓存dentry,称为dentry cache

  1. struct dentry {
  2. unsigned int d_flags; //用来判断改dentry是否是mountpoint
  3. ...
  4. }
  5. static inline bool d_mountpoint(const struct dentry *dentry)
  6. {
  7. return dentry->d_flags & DCACHE_MOUNTED;
  8. }

Inode

inode 代表一个文件,包含各种元数据,以及对文件的读写函数,文件的读写缓存等信息,一个真实的文件可以有多个dentry,但只有一个inode, inode中有几个重要的结构体

// Abbreviated excerpt of struct inode: only the members discussed in these notes are listed.
struct inode {
    struct address_space *i_mapping;        // caches file contents; reads and writes first look for the data in the cache held by i_mapping
    const struct file_operations    *i_fop; // pointer to the file_operations associated with this inode
    union {
        struct pipe_inode_info    *i_pipe;
        struct block_device    *i_bdev;
        struct cdev        *i_cdev;
        char            *i_link;
        unsigned        i_dir_seq;
    };  // the block device behind the filesystem, or one of the other kinds listed in the union
}

File

文件对象的作用是描述进程和文件交互的关系,进程每打开一个文件,内核就会动态创建一个文件对象

// fs.h
// Abbreviated excerpt of struct file: only the member discussed in these notes is listed.
struct file {
    loff_t            f_pos; // the process's current position in the file; e.g. after reading 10 bytes from the start, f_pos refers to the 11th byte
}

文件系统的注册

当注册新的文件系统时,会调用register_filesystem函数,这个函数会检查是否已经存在相同的名字,如果不存在则将该文件系统挂到全局的filesystem链表上

/**
 *    register_filesystem - add a filesystem type to the kernel's list
 *    @fs: the file system structure to register
 *
 *    Makes @fs known to the kernel so that mount and other syscalls can
 *    use it. The name must not contain a '.'; this is enforced with
 *    BUG_ON().
 *
 *    Returns 0 on success. Returns -EBUSY when fs->next is already
 *    non-NULL, or when find_filesystem() reports an existing entry for
 *    the same name.
 *
 *    @fs itself is linked into the kernel structures, so it must not be
 *    freed until the file system has been unregistered.
 */

int register_filesystem(struct file_system_type * fs)
{
    struct file_system_type **slot;
    int ret;

    BUG_ON(strchr(fs->name, '.'));
    if (fs->next)
        return -EBUSY;

    write_lock(&file_systems_lock);
    /*
     * A non-NULL *slot means the name is already taken; otherwise fs is
     * stored into the slot.
     */
    slot = find_filesystem(fs->name, strlen(fs->name));
    if (*slot) {
        ret = -EBUSY;
    } else {
        *slot = fs;
        ret = 0;
    }
    write_unlock(&file_systems_lock);

    return ret;
}

EXPORT_SYMBOL(register_filesystem);

fuse 模块初始化

/*
 * fuse_init - module initialisation for fuse.
 *
 * Logs the API version, initialises fuse_conn_list, then calls
 * fuse_fs_init(), fuse_dev_init(), fuse_sysfs_init() and fuse_ctl_init()
 * in that order. If one of them fails, the cleanup functions of the steps
 * that already succeeded are called in reverse order and the failing
 * step's error code is returned. Returns 0 when every step succeeds.
 */
static int __init fuse_init(void)
{
    int rc;

    printk(KERN_INFO "fuse init (API version %i.%i)\n",
           FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);

    INIT_LIST_HEAD(&fuse_conn_list);

    /* Nothing to undo yet if the first step fails. */
    rc = fuse_fs_init();
    if (rc != 0)
        return rc;

    rc = fuse_dev_init();
    if (rc != 0)
        goto undo_fs;

    rc = fuse_sysfs_init();
    if (rc != 0)
        goto undo_dev;

    rc = fuse_ctl_init();
    if (rc != 0)
        goto undo_sysfs;

    sanitize_global_limit(&max_user_bgreq);
    sanitize_global_limit(&max_user_congthresh);

    return 0;

    /* Unwind in reverse order of setup; each label falls through to the next. */
undo_sysfs:
    fuse_sysfs_cleanup();
undo_dev:
    fuse_dev_cleanup();
undo_fs:
    fuse_fs_cleanup();
    return rc;
}

fuse_fs_init 初始化注册fuse文件系统
fuse_sysfs_init 注册到 /sys/fs/fuse/connections sysfs,这是一个kobject

/*
 * fuse_sysfs_init - create the "fuse" kobject and its "connections"
 * mount point.
 *
 * Creates a kobject named "fuse" under fs_kobj and stores it in fuse_kobj,
 * then creates a sysfs mount point named "connections" on it.
 *
 * Returns 0 on success, -ENOMEM if the kobject could not be created, or
 * the error from sysfs_create_mount_point(); in the latter case the
 * reference on fuse_kobj is dropped with kobject_put() first.
 */
static int fuse_sysfs_init(void)
{
    int rc;

    fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
    if (!fuse_kobj)
        return -ENOMEM;

    rc = sysfs_create_mount_point(fuse_kobj, "connections");
    if (rc) {
        kobject_put(fuse_kobj);
        return rc;
    }

    return 0;
}

kobject_create_and_add("fuse", fs_kobj) 就是在sysfs中以fs_kobj(对应 /sys/fs)为父节点创建名为fuse的kobject,也就是 /sys/fs/fuse 目录
kobject_put 用于释放对kobject的引用,引用计数归零时回收其内存

最后是fuse_ctl_init,注册了fusectl 文件系统

内核 用户态通讯

/dev/fuse 的诞生

/*
 * Misc-device descriptor for the device named "fuse" (the /dev/fuse node
 * discussed in these notes). File operations on it are dispatched through
 * fuse_dev_operations.
 */
static struct miscdevice fuse_miscdevice = {
    .name  = "fuse",
    .minor = FUSE_MINOR,
    .fops  = &fuse_dev_operations,
};
/*
 * File operations installed on the fuse misc device, grouped by purpose.
 * Exported (GPL-only) so that other modules can reference the table.
 */
const struct file_operations fuse_dev_operations = {
    .owner          = THIS_MODULE,

    /* open / close */
    .open           = fuse_dev_open,
    .release        = fuse_dev_release,

    /* data transfer */
    .read_iter      = fuse_dev_read,
    .write_iter     = fuse_dev_write,
    .splice_read    = fuse_dev_splice_read,
    .splice_write   = fuse_dev_splice_write,

    /* readiness and async notification */
    .poll           = fuse_dev_poll,
    .fasync         = fuse_dev_fasync,

    /* the same handler serves both the native and the compat ioctl entry */
    .unlocked_ioctl = fuse_dev_ioctl,
    .compat_ioctl   = fuse_dev_ioctl,

    /* seeking is wired to no_llseek */
    .llseek         = no_llseek,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);

用户态 初始化过程

fuse_session_new -> fuse_session_mount -> fuse_kern_mount
/*
 * fuse_mount - mount callback for the fuse filesystem type.
 * @fs_type:  filesystem type being mounted
 * @flags:    mount flags, passed through unchanged
 * @dev_name: not used by this function
 * @raw_data: mount data, passed through unchanged
 *
 * Delegates to mount_nodev() with fuse_fill_super as the superblock fill
 * callback and returns whatever mount_nodev() returns.
 */
static struct dentry *fuse_mount(struct file_system_type *fs_type,
                                 int flags, const char *dev_name,
                                 void *raw_data)
{
    struct dentry *root;

    root = mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
    return root;
}

fuse在mount的时候调用的是mount_nodev, mount_nodev: mount a filesystem that is not backed by a device

fuse_fill_super

在fuse_mount的时候就会调用 fuse_fill_super 来填充fuse的super_block
fuse_fill_super的过程除了填充superblock,还会初始化fuse_connection, fusedev, 初始化bdi
这时候fuse ko会给fuse userspace发送第一个请求:init请求,请求中会包含当前fuse ko所支持的特性,用户态可以在对这个请求的回复中告诉ko,哪些特性是不需要的

fuse_dev_read

用户态从kernel读取请求,都会从这个/dev/fuse设备中去读取(从perf数据看来,目前fuse的瓶颈很大一块在这个地方,这里有大量的spinlock,配置的idle线程数越多,在spinlock上等待的时间就越多)

文件操作

file_operations

内核中定义的file_operations 可以看做是一个代理,所有vfs过来的请求首先会执行ko里定义的op

/*
 * File operations used for fuse files, grouped by purpose. Requests coming
 * from the VFS are dispatched to these handlers.
 */
static const struct file_operations fuse_file_operations = {
    /* open / close */
    .open           = fuse_open,
    .flush          = fuse_flush,
    .release        = fuse_release,

    /* data path */
    .llseek         = fuse_file_llseek,
    .read_iter      = fuse_file_read_iter,
    .write_iter     = fuse_file_write_iter,
    .splice_read    = generic_file_splice_read,
    .mmap           = fuse_file_mmap,
    .fallocate      = fuse_file_fallocate,
    .fsync          = fuse_fsync,

    /* locking */
    .lock           = fuse_file_lock,
    .flock          = fuse_file_flock,

    /* control and polling */
    .unlocked_ioctl = fuse_file_ioctl,
    .compat_ioctl   = fuse_file_compat_ioctl,
    .poll           = fuse_file_poll,
};

以open为例

// Excerpt of the fuse open path; the rest of the body is elided below.
// NOTE(review): presumably shared by the file and directory open paths (see the isdir parameter) - confirm against the full source.
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
    struct fuse_conn *fc = get_fuse_conn(inode); // fuse connection obtained from the inode
    int err;
    // true only when the file is opened with O_TRUNC and the connection has
    // both atomic_o_trunc and writeback_cache set
    bool lock_inode = (file->f_flags & O_TRUNC) &&
              fc->atomic_o_trunc &&
              fc->writeback_cache;
    ...

kernel中的op会从inode结构体中获取fuse_conn, 然后将请求写到conn的pending list上,并唤醒等待在list上的处理线程,随后睡眠,等待reply的返回。这里就会出现一个问题:如果用户态进程在接收到请求后异常退出,没有reply,那么文件系统将会被hang住。所以这里增加了一个fusectl控制文件,当fuse进程异常退出,被重新拉起后,首先要将inflight的请求flush掉,并且最重要的是,要唤醒等在reply上的线程。当然,就和你想的那样,这些被flush掉的请求都会返回EIO,不过在4.19内核上,我们支持了请求重放的能力,这使得异常恢复变得更加顺滑

热升级是如何实现的

前面提到了 fuse_dev_read函数,用户态通过/dev/fuse 设备读取 fuse request,所以我们只需要将这个设备open出来的fd保存下来(实际上就是新增这个fd的引用,防止被close),那么我们就可以通过 UDS(Unix domain socket)把这个fd send到新的进程中,新的进程就可以跳过mount过程,直接从这个设备读取fuse request