什么是fuse
FUSE(Filesystem in Userspace)是一个文件系统框架,与ext4的区别在于,FUSE有一部分逻辑运行在用户态,这使得我们可以通过FUSE实现很多自定义的逻辑
VFS
Virtual File System, 为所有的文件系统提供了统一的接口,VFS中定义了几个重要的struct, dentry, inode, super_block
Superblock
代表了整个文件系统本身,超级块保存了文件系统设定的文件块大小、超级块操作函数,文件系统的所有inode也都要链接到超级块(super_block)的链表头
Dentry
每个文件都有一个dentry, 这个dentry链接到上级目录的dentry,为了加快对dentry的查找,内核使用了hash表来缓存dentry,称为dentry cache
/* Abridged excerpt of struct dentry; the remaining members are elided below. */
struct dentry {
unsigned int d_flags; // used to tell whether this dentry is a mountpoint (DCACHE_MOUNTED bit, tested by d_mountpoint())
...
}
/*
 * d_mountpoint - report whether a dentry carries the DCACHE_MOUNTED flag.
 * @dentry: dentry whose d_flags word is inspected
 *
 * Returns true when the DCACHE_MOUNTED bit is set in dentry->d_flags,
 * false otherwise.
 */
static inline bool d_mountpoint(const struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_MOUNTED) != 0;
}
Inode
inode 代表一个文件,包含各种元数据,以及对文件的读写函数、文件的读写缓存等信息。一个真实的文件可以有多个dentry,但只有一个inode。inode中有几个重要的字段:
/* Abridged excerpt of struct inode showing only the members discussed in the text. */
struct inode {
struct address_space *i_mapping; // caches file contents; reads and writes first look for the data in the cache held by i_mapping
const struct file_operations *i_fop;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
}; // author's note: represents the block device behind the filesystem, or possibly one of the other types
}
File
文件对象的作用是描述进程和文件交互的关系,进程打开一个文件,内核就会动态创建一个文件对象
// fs.h
/* Abridged excerpt of struct file; only the position member is shown. */
struct file {
loff_t f_pos; // the process's current position in the file; e.g. after reading 10 bytes, f_pos points at the 11th byte
}
文件系统的注册
当注册新的文件系统时,会调用register_filesystem函数,这个函数会检查是否已经存在相同的名字,如果不存在则将该文件系统挂到全局的filesystem链表上
/**
 * register_filesystem - make a filesystem type known to the kernel
 * @fs: the file system structure to register
 *
 * Links @fs into the kernel's list of file systems so that mount and
 * other syscalls can find it.
 *
 * The name must not contain a '.' (enforced with BUG_ON). A structure
 * whose ->next is already set is rejected, as is a name for which
 * find_filesystem() returns an already occupied slot.
 *
 * Because the &struct file_system_type is linked into kernel structures,
 * it must not be freed until the file system has been unregistered.
 *
 * Return: 0 on success, -EBUSY if @fs is already linked or the name is
 * already taken.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int err;

	BUG_ON(strchr(fs->name, '.'));

	/* Already chained into a list: refuse a second registration. */
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name, strlen(fs->name));
	if (*slot) {
		err = -EBUSY;
	} else {
		*slot = fs;
		err = 0;
	}
	write_unlock(&file_systems_lock);

	return err;
}
EXPORT_SYMBOL(register_filesystem);
fuse 模块初始化
/*
 * Module entry point for fuse.
 *
 * Logs the API version, initialises fuse_conn_list and then runs the
 * sub-initialisers in order: fuse_fs_init(), fuse_dev_init(),
 * fuse_sysfs_init(), fuse_ctl_init(). If one of them fails, the steps
 * that already succeeded are undone in reverse order and the failing
 * step's error code is returned. On success the two global limits are
 * passed through sanitize_global_limit() and 0 is returned.
 */
static int __init fuse_init(void)
{
	int err;

	printk(KERN_INFO "fuse init (API version %i.%i)\n",
	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);

	INIT_LIST_HEAD(&fuse_conn_list);

	/* Nothing to unwind yet if the very first step fails. */
	err = fuse_fs_init();
	if (err)
		return err;

	err = fuse_dev_init();
	if (err)
		goto undo_fs;

	err = fuse_sysfs_init();
	if (err)
		goto undo_dev;

	err = fuse_ctl_init();
	if (err)
		goto undo_sysfs;

	sanitize_global_limit(&max_user_bgreq);
	sanitize_global_limit(&max_user_congthresh);

	return 0;

undo_sysfs:
	fuse_sysfs_cleanup();
undo_dev:
	fuse_dev_cleanup();
undo_fs:
	fuse_fs_cleanup();
	return err;
}
fuse_fs_init 初始化注册fuse文件系统
fuse_sysfs_init 注册到 /sys/fs/fuse/connections sysfs,这是一个kobject
/*
 * Create the "fuse" kobject under fs_kobj and a "connections" mount
 * point beneath it.
 *
 * Returns 0 on success, -ENOMEM if the kobject could not be created, or
 * the error from sysfs_create_mount_point() (in which case the kobject
 * reference is dropped again with kobject_put()).
 */
static int fuse_sysfs_init(void)
{
	int ret;

	fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
	if (!fuse_kobj)
		return -ENOMEM;

	ret = sysfs_create_mount_point(fuse_kobj, "connections");
	if (ret) {
		/* Undo the kobject creation before reporting the failure. */
		kobject_put(fuse_kobj);
		return ret;
	}

	return 0;
}
kobject_create_and_add("fuse", fs_kobj) 就是将 fuse 这个 kobject 加到 /sys/fs 下(sysfs文件系统)
kobject_put 用于减少 kobject 的引用计数,引用计数归零时回收其内存
最后是fuse_ctl_init,注册了fusectl 文件系统
内核 用户态通讯
/dev/fuse 的诞生
/*
 * Misc device descriptor for the device named "fuse": it uses the minor
 * number FUSE_MINOR and routes its file operations to fuse_dev_operations.
 */
static struct miscdevice fuse_miscdevice = {
.minor = FUSE_MINOR,
.name = "fuse",
.fops = &fuse_dev_operations,
};
/*
 * File operations for the fuse device (referenced by fuse_miscdevice.fops).
 * Reads and writes go through the iter and splice handlers below; seeking
 * is wired to no_llseek. Exported with EXPORT_SYMBOL_GPL.
 */
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
.llseek = no_llseek,
.read_iter = fuse_dev_read,
.splice_read = fuse_dev_splice_read,
.write_iter = fuse_dev_write,
.splice_write = fuse_dev_splice_write,
.poll = fuse_dev_poll,
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
/* compat ioctl shares the same handler as the native ioctl */
.compat_ioctl = fuse_dev_ioctl,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
用户态 初始化过程
fuse_session_new -> fuse_session_mount -> fuse_kern_mount
/*
 * Mount callback for the fuse filesystem type.
 *
 * Delegates to mount_nodev(), handing it fuse_fill_super as the callback
 * that fills in the superblock. dev_name is not used here.
 *
 * Returns whatever mount_nodev() returns.
 */
static struct dentry *fuse_mount(struct file_system_type *fs_type,
				 int flags, const char *dev_name,
				 void *raw_data)
{
	struct dentry *root;

	root = mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
	return root;
}
fuse在mount的时候调用的是mount_nodev, mount_nodev: mount a filesystem that is not backed by a device
fuse_fill_super
在fuse_mount的时候就会调用 fuse_fill_super 来填充fuse的super_block
fuse_fill_super的过程除了填充superblock,还会初始化fuse_connection, fusedev, 初始化bdi
这时候fuse ko会给fuse userspace发送第一个请求,即 init 请求。请求会包含当前fuse所支持的特性,用户态可以在这个请求的回复中告诉ko,哪些特性是不需要的
fuse_dev_read
用户态从kernel读取请求,都会从这个/dev/fuse设备中去读取(从perf数据看来,目前fuse的瓶颈很大一块在这个地方:这里有大量的spinlock,配置的idle线程数越多,在spinlock上等待的时间就越多)
文件操作
file_operations
内核中定义的file_operations 可以看做是一个代理,所有vfs过来的请求首先会执行ko里定义的op
/*
 * File operations table for fuse files: each VFS entry point listed here
 * is routed to the corresponding fuse_* handler, except splice_read,
 * which uses generic_file_splice_read.
 */
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,
.write_iter = fuse_file_write_iter,
.mmap = fuse_file_mmap,
.open = fuse_open,
.flush = fuse_flush,
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
.flock = fuse_file_flock,
.splice_read = generic_file_splice_read,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
.fallocate = fuse_file_fallocate,
};
以open为例
/*
 * Excerpt: only the opening declarations of fuse_open_common() are shown;
 * the remainder of the body is elided by the "..." below.
 *
 * inode:  inode being opened; the fuse connection is looked up from it
 * file:   file object being opened; its f_flags are inspected
 * isdir:  NOTE(review): presumably distinguishes directory opens - its use
 *         is in the elided part, confirm against the full source
 */
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
/*
 * True only when the file is opened with O_TRUNC and the connection has
 * both atomic_o_trunc and writeback_cache set.
 */
bool lock_inode = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
...
kernel中的op会从inode结构体中获取fuse_conn,然后将请求写到conn的pending list上,并唤醒等待在该list上的处理线程,随后睡眠,等待reply的返回。(这里就会出现一个问题:如果用户态进程在接收到请求后异常退出,没有reply,那么将会把文件系统hang住。)所以这里增加了一个fusectl控制文件:当fuse进程异常退出、被重新拉起后,首先要将inflight的请求flush掉,并且最重要的是,要唤醒等在reply上的线程。当然,就和你想的那样,这些被flush掉的请求都会返回EIO;不过在4.19内核上,我们支持了请求重放的能力,这使得异常恢复变得更加顺滑
热升级是如何实现的
前面提到了 fuse_dev_read 函数,它通过 /dev/fuse 设备读取 fuse request。所以我们只需要将这个设备open出来的fd保存下来(实际上就是新增这个fd的引用,防止被close),然后通过 UDS(Unix domain socket)把这个fd send到新的进程中,新的进程就可以跳过mount过程,直接从这个设备读取fuse request