[Original] A Detailed Analysis of the PageCache (Reads/Writes/Writeback), Based on Linux 6.12.32
Posted: 2025-12-04 17:04
This article is fairly long and quite detailed, so please read it patiently. My own understanding is limited; if you find any mistakes, please point them out.
All of the analysis in this article is based on Linux 6.12.32.
i_pages is the data structure used to manage the page cache.
The XArray is implemented on top of the radix tree. In the Linux kernel, working with an XArray feels less like manipulating a tree and more like indexing into an array that can grow without bound.
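To make this concrete, here is a minimal sketch of the core XArray API (DEFINE_XARRAY, xa_store, xa_load, xa_erase); the index 42 and the stored pointer are purely illustrative:

#include <linux/xarray.h>

static DEFINE_XARRAY(demo_array);	/* a statically initialized XArray */

static int xarray_demo(struct page *page)
{
	void *old;

	/* Store a pointer at an arbitrary index, as if indexing an array. */
	old = xa_store(&demo_array, 42, page, GFP_KERNEL);
	if (xa_is_err(old))
		return xa_err(old);

	/* Load it back; an empty slot reads as NULL. */
	if (xa_load(&demo_array, 42) != page)
		return -EINVAL;

	/* Erase the entry; the slot reads as NULL afterwards. */
	xa_erase(&demo_array, 42);
	return 0;
}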
The entries stored in a node's slots point either to actual data or to child nodes, and come in three basic types:
- Pointer entry: points to the actual data object.
- Internal entry: points to the next-level xa_node; a special internal entry, the retry entry, marks a node that is being modified or about to be freed, telling RCU readers to retry.
- Value entry: stores an integer, used for swap/shadow entries or tagged pointers.
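The three types are told apart by the low two bits of the entry value. Below is a small sketch that mirrors the kernel's xa_is_value()/xa_is_internal() bit tests; the helper name entry_kind() is mine, not a kernel function:

/* Hypothetical helper: classify an XArray entry by its low tag bits. */
static const char *entry_kind(const void *entry)
{
	unsigned long v = (unsigned long)entry;

	if (v & 1)		/* bit 0 set: value entry (cf. xa_is_value) */
		return "value";
	if ((v & 3) == 2)	/* low bits 10: internal entry (cf. xa_is_internal) */
		return "internal";
	return v ? "pointer" : "NULL";
}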
In modern Linux kernels, bdflush has been replaced by per-BDI (Backing Device Info) writeback threads. Each BDI owns one or more bdi_writeback structures, and each bdi_writeback corresponds to one writeback worker (implemented on top of a workqueue). This is the mechanism responsible for writing Linux's dirty data back to disk.
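As a sketch of how one of these workers gets kicked: each bdi_writeback embeds a delayed work item (dwork) that is queued on the global bdi_wq workqueue, along the lines of the kernel's wb_wakeup() (reconstructed from memory, so treat the details as indicative):

/* Sketch: wake a per-BDI writeback worker by scheduling its delayed work. */
static void wb_wakeup_sketch(struct bdi_writeback *wb)
{
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);	/* run as soon as possible */
	spin_unlock_irq(&wb->work_lock);
}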
The creation of an address_space (leaving the swap cache aside) splits into two main parts (the core function is alloc_inode).
Paths 1/2/3 are brief summaries of the call chains given below.
At the core, everything ends up calling the inode_init_always function.
Note: inode_init_always_gfp only initializes the basic fields of the address_space; the initialization of i_pages is done in inode_init_once().
Allocate the inode structure and initialize it. Here we can clearly see that the inode is obtained via ops->alloc_inode(sb) / alloc_inode_sb(), and then initialized by inode_init_always.
i_pages is initialized along exactly one path (triggered inside alloc_inode): inode_init_once() → __address_space_init_once().
What xa_init_flags() does: it initializes the XArray itself, setting xa_head to NULL, recording the flags in xa_flags, and initializing the xa_lock spinlock.
inode_init_once() acts as the slab allocator's constructor and is invoked automatically when an inode object is first allocated from the slab:
It initializes a number of basic address_space fields.
This is actually easy to understand. Given the page index derived from a file offset, say index = 0x00100010, we start at the root entry and use each node's shift to extract the slot index for that level.
For example, with index = 0x010010, first fetch the root entry, whose shift is 12; then slots_index = (0x010010 >> 12) & 0x3f = 16, which is less than XA_CHUNK_SIZE and therefore valid.
We keep descending this way until a pointer entry, value entry, or NULL entry is reached, and return it.
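A tiny self-contained demo of this per-level offset extraction (plain userspace C, assuming the default XA_CHUNK_SHIFT of 6 and a three-level tree with shifts 12/6/0):

#include <stdio.h>

#define XA_CHUNK_SHIFT	6
#define XA_CHUNK_MASK	((1UL << XA_CHUNK_SHIFT) - 1)

int main(void)
{
	unsigned long index = 0x010010;
	unsigned int shifts[] = { 12, 6, 0 };	/* root -> middle -> leaf */

	for (int i = 0; i < 3; i++) {
		unsigned long offset = (index >> shifts[i]) & XA_CHUNK_MASK;
		printf("shift=%2u -> slots[%lu]\n", shifts[i], offset);
	}
	return 0;
}

For index 0x010010 this prints slots[16], slots[0] and slots[16] for the three levels.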
Whether a file is mapped with mmap or opened with open() and read directly, the flow is the same: compute the index, look it up in the page cache, and if there is no hit, create a page, insert it into the cache, and then perform the read.
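On a miss, the lookup and the creation are usually folded into one call. A hedged sketch of how a caller can look up, or create and lock, the folio at a given index with __filemap_get_folio() (the flag combination here is illustrative):

/* Look up the folio at @index in @mapping, creating and locking it on a miss. */
static struct folio *get_or_create_folio(struct address_space *mapping,
					 pgoff_t index)
{
	struct folio *folio;

	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_CREAT | FGP_ACCESSED,
				    mapping_gfp_mask(mapping));
	if (IS_ERR(folio))	/* in 6.x kernels failure is an ERR_PTR, not NULL */
		return folio;
	/* On success the folio is locked and referenced; the caller unlocks/puts it. */
	return folio;
}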
As with reads, there are multiple entry paths.
Dirty-data writeback is covered separately later in this article.
Essentially, all of these different paths either submit work items to the BDI or call the BDI's writeback functions directly.
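In other words, a caller fills in a struct wb_writeback_work (shown later in this article) and queues it on the target bdi_writeback. A simplified sketch modeled on wb_queue_work() in fs/fs-writeback.c, written from memory, with the completion accounting omitted:

/* Sketch: submit a writeback request to a bdi_writeback's work list. */
static void queue_writeback_work_sketch(struct bdi_writeback *wb,
					struct wb_writeback_work *work)
{
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);	/* wake the worker now */
	}
	spin_unlock_irq(&wb->work_lock);
}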
Purpose: create the global writeback workqueue bdi_wq, on which all writeback work executes.
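For reference, bdi_wq is created at boot in mm/backing-dev.c; a sketch of default_bdi_init() as I recall it (the exact workqueue flags may differ slightly between versions):

struct workqueue_struct *bdi_wq;

static int __init default_bdi_init(void)
{
	/* An unbound, memory-reclaim-safe workqueue shared by all BDIs. */
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}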
#define FGP_ACCESSED	((__force fgf_t)0x00000001)
#define FGP_LOCK	((__force fgf_t)0x00000002)
#define FGP_CREAT	((__force fgf_t)0x00000004)
#define FGP_WRITE	((__force fgf_t)0x00000008)
#define FGP_NOFS	((__force fgf_t)0x00000010)
#define FGP_NOWAIT	((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP	((__force fgf_t)0x00000040)
#define FGP_STABLE	((__force fgf_t)0x00000080)
#define FGF_GET_ORDER(fgf)	(((__force unsigned)fgf) >> 26)	/* top 6 bits */

#define FGP_WRITEBEGIN	(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

struct address_space {
	struct inode		*host;			/* owning inode */
	struct xarray		i_pages;		/* XArray (radix tree) holding the page cache */
	struct rw_semaphore	invalidate_lock;	/* protects page invalidation */
	gfp_t			gfp_mask;		/* GFP mask for page allocation */
	atomic_t		i_mmap_writable;	/* count of writable memory mappings */
	struct rb_root_cached	i_mmap;			/* RB tree of VMAs mapping this file */
	unsigned long		nrpages;		/* total number of pages in the cache */
	pgoff_t			writeback_index;	/* starting index for writeback */
	const struct address_space_operations *a_ops;	/* address_space operations */
	unsigned long		flags;			/* AS_* flags */
	errseq_t		wb_err;			/* writeback error sequence */
	spinlock_t		i_private_lock;		/* protects private data */
	struct list_head	i_private_list;		/* private data list */
	struct rw_semaphore	i_mmap_rwsem;		/* protects i_mmap */
	void			*i_private_data;	/* filesystem-private data */
};

struct xarray {
	spinlock_t	xa_lock;
/* private: The rest of the data structure is not to be used directly. */
	gfp_t		xa_flags;
	void __rcu	*xa_head;
};

#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT	(IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
#define XA_CHUNK_SIZE	(1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK	(XA_CHUNK_SIZE - 1)

XArray tree structure (3-level example):

xa_head
 └─ xa_node (shift=12)
     ├─ slots[0] ─> xa_node (shift=6)
     │               ├─ slots[0] ─> folio (index 0)
     │               ├─ slots[1] ─> folio (index 1)
     │               └─ ...
     ├─ slots[1] ─> xa_node (shift=6)
     │               ├─ slots[0] ─> folio (index 64)
     │               └─ ...
     └─ slots[2] ─> NULL

Index calculation:
- index = 0:  0 >> 12 = 0,  0 >> 6 = 0,  0 & 63 = 0  → slots[0][0]
- index = 1:  1 >> 12 = 0,  1 >> 6 = 0,  1 & 63 = 1  → slots[0][1]
- index = 64: 64 >> 12 = 0, 64 >> 6 = 1, 64 & 63 = 0 → slots[1][0]

struct xa_node {
	unsigned char	shift;		/* Bits remaining in each slot */
	unsigned char	offset;		/* Slot offset in parent */
	unsigned char	count;		/* Total entry count */
	unsigned char	nr_values;	/* Value entry count */
	struct xa_node __rcu *parent;	/* NULL at top of tree */
	struct xarray	*array;		/* The array we belong to */
	union {
		struct list_head private_list;	/* For tree user */
		struct rcu_head	 rcu_head;	/* Used when freeing node */
	};
	void __rcu	*slots[XA_CHUNK_SIZE];
	union {
		unsigned long	tags[XA_MAX_MARKS][XA_MARK_LONGS];
		unsigned long	marks[XA_MAX_MARKS][XA_MARK_LONGS];
	};
};

Entry (void *)
│
├─ [00] Pointer entry
│   ├─ NULL entry (the NULL pointer itself)
│   └─ ordinary pointer (points to the actual data object)
│
├─ [10] Internal entry
│   ├─ node pointer (> 4096)
│   ├─ sibling entry (0-62)
│   ├─ retry entry (256)
│   ├─ zero entry (257)
│   └─ error entry (-4094 ~ -2)
│
└─ [x1] Value entry
    ├─ pure value entry (bit 0 = 1; low two bits = 01)
    └─ tagged pointer (low two bits = 11)

/* The NULL pointer itself is the NULL entry. */
void *entry = NULL;

/* Example: a pointer to a struct folio. */
void *entry = folio;	/* low 2 bits == 00 */

static inline void *xa_mk_node(const struct xa_node *node)
{
	return (void *)((unsigned long)node | 2);
}

/* Private */
static inline struct xa_node *xa_to_node(const void *entry)
{
	return (struct xa_node *)((unsigned long)entry - 2);
}

static inline bool xa_is_node(const void *entry)
{
	return xa_is_internal(entry) && (unsigned long)entry > 4096;
}

Suppose a 64KB large folio (order = 4) occupies the indices of 16 4KB pages, so indices 100-115 all refer to the same folio.

Storage layout:
  i_pages[100] = folio pointer	← canonical slot
  i_pages[101] = sibling(100)	← points back to slot 100
  i_pages[102] = sibling(100)
  ...
  i_pages[115] = sibling(100)

Looking up index 103:
  xas_descend() → finds sibling(100) → jumps to slot 100 → returns the folio pointer

/* Create a sibling entry. */
static inline void *xa_mk_sibling(unsigned int offset)
{
	return xa_mk_internal(offset);
}

/* Test for a sibling entry. */
static inline bool xa_is_sibling(const void *entry)
{
	return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
		(entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
}

/* Extract the canonical offset. */
static inline unsigned long xa_to_sibling(const void *entry)
{
	return xa_to_internal(entry);
}

#define XA_RETRY_ENTRY		xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
	return unlikely(entry == XA_RETRY_ENTRY);
}

#define XA_ZERO_ENTRY		xa_mk_internal(257)

/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry.  You can only see zero entries by using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
	return unlikely(entry == XA_ZERO_ENTRY);
}

static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
	return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
static inline bool xa_is_err(const void *entry)
{
	return unlikely(xa_is_internal(entry) &&
			entry >= xa_mk_internal(-MAX_ERRNO));
}

static inline void *xa_mk_value(unsigned long v)
{
	WARN_ON((long)v < 0);
	return (void *)((v << 1) | 1);
}

/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
	return (unsigned long)entry >> 1;
}

/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
	return (unsigned long)entry & 1;
}

/* Example */
/* Create a value entry: encode the integer 123. */
void *entry = xa_mk_value(123);
/* Extract the value: returns 123. */
unsigned long value = xa_to_value(entry);
/* Test whether an entry is a value entry. */
if (xa_is_value(entry)) {
	/* this is a value entry, not a pointer */
}
struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */

	unsigned long state;		/* Always use atomic bitops on this */
	unsigned long last_old_flush;	/* last old data flush */

	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	struct list_head b_dirty_time;	/* time stamps are dirty */
	spinlock_t list_lock;		/* protects the b_* lists */

	atomic_t writeback_inodes;	/* number of inodes under writeback */
	struct percpu_counter stat[NR_WB_STAT_ITEMS];

	unsigned long bw_time_stamp;	/* last time write bw is updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written at bw_time_stamp */
	unsigned long write_bandwidth;	/* the estimated write bandwidth */
	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

	/*
	 * The base dirty throttle rate, re-calculated on every 200ms.
	 * All the bdi tasks' dirty rate will be curbed under it.
	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
	 * in small steps and is much more smooth/stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;
	enum wb_reason start_all_reason;

	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
	struct list_head work_list;
	struct delayed_work dwork;	/* work item used for writeback */
	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */

	struct list_head bdi_node;	/* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
	struct percpu_ref refcnt;	/* used only for !root wb's */
	struct fprop_local_percpu memcg_completions;
	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */
	struct list_head b_attached;	/* attached inodes, protected by list_lock */
	struct list_head offline_node;	/* anchored at offline_cgwbs */

	union {
		struct work_struct release_work;
		struct rcu_head rcu;
	};
#endif
};

struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

alloc_inode(sb)
 │
 ├─> ops->alloc_inode(sb) or alloc_inode_sb()
 │    └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
 │         │
 │         └─> [inside the slab allocator]
 │              └─> if the object is freshly allocated:
 │                   └─> init_once(foo)		[slab constructor]
 │                        └─> inode_init_once(inode)
 │                             ├─> memset(inode, 0, ...)	// zero the whole structure
 │                             ├─> initialize the various list heads
 │                             └─> __address_space_init_once(&inode->i_data)
 │                                  └─> xa_init_flags(&mapping->i_pages, ...)	// i_pages initialized
 │
 └─> inode_init_always(sb, inode)
      └─> inode_init_always_gfp(sb, inode, GFP_NOFS)
           ├─> initialize the basic inode fields
           └─> initialize the address_space runtime fields
                ├─> mapping->a_ops = &empty_aops
                ├─> mapping->host = inode
                ├─> mapping_set_gfp_mask(...)
                └─> inode->i_mapping = mapping
Scenario 1: mounting an ext4 filesystem and creating the root inode
 └─> ext4_iget(sb, EXT4_ROOT_INO, ...)
      └─> iget_locked(sb, ino)
           └─> alloc_inode(sb)
                ├─> ext4_alloc_inode()
                │    └─> kmem_cache_alloc(ext4_inode_cachep)
                │         └─> init_once() is invoked (the slab constructor)
                │              └─> inode_init_once()
                │                   └─> __address_space_init_once()
                │                        └─> xa_init_flags(&i_pages, ...)	// i_pages initialized
                └─> inode_init_always(sb, inode)
                     └─> inode_init_always_gfp()
                          └─> initialize the remaining address_space fields
                               ├─> mapping->a_ops = &empty_aops
                               ├─> mapping->host = inode
                               ├─> mapping_set_gfp_mask(...)
                               └─> inode->i_mapping = mapping

Scenario 2: creating a new file
 └─> ext4_new_inode()
      └─> new_inode(sb)
           └─> alloc_inode(sb)
                ├─> as above, init_once() initializes i_pages
                └─> inode_init_always() initializes the remaining fields

Scenario 3: opening an existing file
 └─> ext4_iget()
      └─> iget_locked()
           └─> if the inode is not in the inode cache:
                └─> alloc_inode()
                     └─> same flow as above

Userspace: open("newfile", O_CREAT)
 └─> open syscall
      └─> do_sys_open()	[fs/open.c]
           └─> do_filp_open()	[fs/namei.c]
                └─> path_openat()	[fs/namei.c]
                     └─> do_open()	[fs/namei.c]
                          └─> vfs_create()	[fs/namei.c]
                               └─> ext4_create()	[fs/ext4/namei.c]
                                    └─> ext4_new_inode_start_handle()	[fs/ext4/namei.c]
                                         └─> __ext4_new_inode()	[fs/ext4/ialloc.c:924]
                                              ├─> new_inode(sb)	[fs/inode.c:1121]
                                              │    └─> new_inode_pseudo(sb)
                                              │         └─> alloc_inode(sb)
                                              │              └─> ext4_alloc_inode()
                                              │                   └─> kmem_cache_alloc(ext4_inode_cachep)
                                              │                        └─> the init_once callback runs on allocation
                                              │                             └─> inode_init_once()	[fs/inode.c:424]
                                              │                                  └─> __address_space_init_once(&inode->i_data)
                                              │                                       └─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                              │                                            └─> i_pages initialization complete!
                                              └─> inode_init_always(sb, inode)
                                                   └─> initialize the remaining address_space fields

Userspace: open("existing_file", O_RDONLY)
 └─> open syscall
      └─> do_sys_open()
           └─> do_filp_open()
                └─> path_openat()
                     └─> do_open()
                          └─> vfs_open()
                               └─> d_inode(path->dentry)
                                    └─> if the inode is not in the cache:
                                         └─> ext4_lookup()	[fs/ext4/namei.c]
                                              └─> ext4_iget(sb, ino, EXT4_IGET_NORMAL)
                                                   └─> __ext4_iget()
                                                        └─> iget_locked(sb, ino)
                                                             └─> alloc_inode(sb)
                                                                  └─> allocated from the slab as above
                                                                       └─> inode_init_once() is invoked
                                                                            └─> i_pages initialized

Kernel boot, or adding a swap area
 └─> swapon syscall	[mm/swapfile.c]
      └─> setup_swap_extents()	[mm/swapfile.c]
           └─> init_swap_address_space()	[mm/swap_state.c:710]
                ├─> compute how many address_spaces are needed
                │    └─> nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES)
                ├─> kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL)
                │    └─> allocate an array of address_spaces
                └─> for each address_space:
                     ├─> xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ)
                     │    └─> note: no XA_FLAGS_ACCOUNT flag here
                     ├─> atomic_set(&space->i_mmap_writable, 0)
                     ├─> space->a_ops = &swap_aops
                     └─> mapping_set_no_writeback_tags(space)
                          └─> the swap cache does not use writeback tags

static struct inode *alloc_inode(struct super_block *sb)
{
	const struct super_operations *ops = sb->s_op;
	struct inode *inode;

	if (ops->alloc_inode)
		inode = ops->alloc_inode(sb);
	else
		inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);

	if (!inode)
		return NULL;

	if (unlikely(inode_init_always(sb, inode))) {
		if (ops->destroy_inode) {
			ops->destroy_inode(inode);
			if (!ops->free_inode)
				return NULL;
		}
		inode->free_inode = ops->free_inode;
		i_callback(&inode->i_rcu);
		return NULL;
	}

	return inode;
}
void inode_init_once(struct inode *inode)
{
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_devices);
	INIT_LIST_HEAD(&inode->i_io_list);
	INIT_LIST_HEAD(&inode->i_wb_list);
	INIT_LIST_HEAD(&inode->i_lru);
	INIT_LIST_HEAD(&inode->i_sb_list);
	__address_space_init_once(&inode->i_data);
	i_size_ordered_init(inode);
}

static void __address_space_init_once(struct address_space *mapping)
{
	xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
	init_rwsem(&mapping->i_mmap_rwsem);
	INIT_LIST_HEAD(&mapping->i_private_list);
	spin_lock_init(&mapping->i_private_lock);
	mapping->i_mmap = RB_ROOT_CACHED;
}

ext4 module load
 └─> module_init(init_ext4_fs)	[fs/ext4/super.c]
      └─> ext4_init_inode_table()	[fs/ext4/super.c]
           └─> ext4_inode_cachep = kmem_cache_create_usercopy(
                       "ext4_inode_cache",
                       sizeof(struct ext4_inode_info), 0,
                       (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                       offsetof(struct ext4_inode_info, i_data),
                       sizeof_field(struct ext4_inode_info, i_data),
                       init_once)	// key point: init_once is registered as the constructor
                └─> kmem_cache_create_usercopy()	[mm/slab_common.c]
                     └─> __kmem_cache_create()
                          └─> create the slab cache and register the init_once callback

Any subsequent inode allocation:
 └─> alloc_inode(sb)	[fs/inode.c:261]
      └─> ext4_alloc_inode(sb)	[fs/ext4/super.c]
           └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
                └─> slab_alloc()	[mm/slub.c]
                     └─> if the object is freshly allocated (uninitialized):
                          └─> slab_post_alloc_hook()
                               └─> call the constructor: init_once(foo)
                                    └─> inode_init_once(inode)
                                         ├─> memset(inode, 0, sizeof(*inode))	// zero the whole inode
                                         ├─> INIT_HLIST_NODE(&inode->i_hash)
                                         ├─> INIT_LIST_HEAD(&inode->i_devices)
                                         ├─> INIT_LIST_HEAD(&inode->i_io_list)
                                         ├─> INIT_LIST_HEAD(&inode->i_wb_list)
                                         ├─> INIT_LIST_HEAD(&inode->i_lru)
                                         ├─> INIT_LIST_HEAD(&inode->i_sb_list)
                                         ├─> __address_space_init_once(&inode->i_data)
                                         │    ├─> xa_init_flags(&mapping->i_pages,
                                         │    │        XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                         │    │    └─> initialize the XArray
                                         │    │         ├─> mapping->i_pages.xa_head = NULL
                                         │    │         ├─> mapping->i_pages.xa_flags = XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT
                                         │    │         └─> initialize xa_lock (an IRQ-safe spinlock)
                                         │    ├─> init_rwsem(&mapping->i_mmap_rwsem)
                                         │    ├─> INIT_LIST_HEAD(&mapping->i_private_list)
                                         │    ├─> spin_lock_init(&mapping->i_private_lock)
                                         │    └─> mapping->i_mmap = RB_ROOT_CACHED
                                         └─> i_size_ordered_init(inode)

int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
	static const struct inode_operations empty_iops;
	static const struct file_operations no_open_fops = {.open = no_open};
	struct address_space *const mapping = &inode->i_data;

	inode->i_sb = sb;
	inode->i_blkbits = sb->s_blocksize_bits;
	inode->i_flags = 0;
	inode->i_state = 0;
	atomic64_set(&inode->i_sequence, 0);
	atomic_set(&inode->i_count, 1);
	inode->i_op = &empty_iops;
	inode->i_fop = &no_open_fops;
	inode->i_ino = 0;
	inode->__i_nlink = 1;
	inode->i_opflags = 0;
	if (sb->s_xattr)
		inode->i_opflags |= IOP_XATTR;
	i_uid_write(inode, 0);
	i_gid_write(inode, 0);
	atomic_set(&inode->i_writecount, 0);
	inode->i_size = 0;
	inode->i_write_hint = WRITE_LIFE_NOT_SET;
	inode->i_blocks = 0;
	inode->i_bytes = 0;
	inode->i_generation = 0;
	inode->i_pipe = NULL;
	inode->i_cdev = NULL;
	inode->i_link = NULL;
	inode->i_dir_seq = 0;
	inode->i_rdev = 0;
	inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
	inode->i_wb_frn_winner = 0;
	inode->i_wb_frn_avg_time = 0;
	inode->i_wb_frn_history = 0;
#endif

	spin_lock_init(&inode->i_lock);
	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

	init_rwsem(&inode->i_rwsem);
	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

	atomic_set(&inode->i_dio_count, 0);

	mapping->a_ops = &empty_aops;
	mapping->host = inode;
	mapping->flags = 0;
	mapping->wb_err = 0;
	atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
	atomic_set(&mapping->nr_thps, 0);
#endif
	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
	mapping->i_private_data = NULL;
	mapping->writeback_index = 0;
	init_rwsem(&mapping->invalidate_lock);
	lockdep_set_class_and_name(&mapping->invalidate_lock,
				   &sb->s_type->invalidate_lock_key,
				   "mapping.invalidate_lock");
	if (sb->s_iflags & SB_I_STABLE_WRITES)
		mapping_set_stable_writes(mapping);
	inode->i_private = NULL;
	inode->i_mapping = mapping;
	INIT_HLIST_HEAD(&inode->i_dentry);	/* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
	inode->i_fsnotify_mask = 0;
#endif
	inode->i_flctx = NULL;

	if (unlikely(security_inode_alloc(inode, gfp)))
		return -ENOMEM;

	this_cpu_inc(nr_inodes);

	return 0;
}
slots_index = (0x10010 >> shift) & 0x3f	// 16, less than XA_CHUNK_SIZE, so valid
entry = node->slots[slots_index]

filemap_get_folio(mapping, index)
 └─> __filemap_get_folio()			(folio starts out as NULL)
      └─> filemap_get_entry(mapping, index)
           │  XA_STATE(xas, &mapping->i_pages, index)
           │  rcu_read_lock()
           ├─> xas_reset(&xas)			(xas->xa_node = XAS_RESTART)
           ├─> xas_load(&xas)
           │    ├─> xas_start(&xas)
           │    │    ├─> xas_valid()?		check the walk state
           │    │    ├─> xa_head(xas->xa)	fetch the root entry
           │    │    └─> range-check the index
           │    └─> while xa_is_node(entry):
           │         ├─> xa_to_node(entry)	convert to an xa_node pointer
           │         └─> xas_descend(&xas, node)
           │              ├─> get_offset(index, node)
           │              │    └─> (index >> node->shift) & XA_CHUNK_MASK
           │              ├─> xa_entry(node, offset)	read the slot
           │              └─> while (xa_is_sibling(entry)):
           │                   └─> jump to the canonical slot
           │         (once the entry is no longer a node, return it:
           │          it may be a folio, a value entry, or NULL)
           ├─> xas_retry(&xas, entry)?
           │    ├─> xa_is_zero(entry)  → true, retry
           │    ├─> xa_is_retry(entry) → xas_reset(), true, retry
           │    └─> otherwise false, continue
           ├─> xa_is_value(entry)?
           │    └─> return the shadow/swap entry without raising the refcount
           ├─> folio_try_get(folio)		take a reference
           └─> xas_reload(&xas)			re-read the slot to verify the entry is unchanged
                ├─> changed:   folio_put(folio), goto repeat
                └─> unchanged: rcu_read_unlock(), return folio

Binary layout of an index:

  [high bits]   [middle bits]   [low bits]
       │              │              │
       │              │              └─> slot in the leaf node   (shift = 0)
       │              └────────────────> slot in the middle node (shift = 6)
       └───────────────────────────────> slot in the root node   (shift = 12)

static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
	return (index >> node->shift) & XA_CHUNK_MASK;
}
Level 0 (root node, shift=12):
  offset = (0x1234 >> 12) & 0x3F = 0x1 & 0x3F = 1		→ access slots[1]
Level 1 (middle node, shift=6):
  offset = (0x1234 >> 6) & 0x3F = 0x48 & 0x3F = 0x08 = 8	→ access slots[8]
Level 2 (leaf node, shift=0):
  offset = (0x1234 >> 0) & 0x3F = 0x1234 & 0x3F = 0x34 = 52	→ access slots[52]

filemap_get_pages()
 │
 ├─> Step 1: compute the page-index range
 │    ├─> index = iocb->ki_pos >> PAGE_SHIFT
 │    └─> last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE)
 │
 ├─> Step 2: first attempt at a batched lookup
 │    └─> filemap_get_read_batch(mapping, index, last_index - 1, fbatch)
 │         └─> look the pages up in i_pages
 │
 ├─> Step 3: if no page was found
 │    ├─> check the IOCB_NOIO flag
 │    ├─> issue synchronous readahead
 │    │    └─> page_cache_sync_readahead()
 │    └─> retry the batched lookup
 │         └─> filemap_get_read_batch()
 │
 ├─> Step 4: if still nothing was found
 │    └─> filemap_create_folio()
 │         └─> create a new folio and insert it into the page cache
 │
 ├─> Step 5: handle the folios that were found
 │    ├─> check the readahead flag
 │    ├─> check the uptodate flag
 │    └─> filemap_update_page() (if needed)
 │
 └─> return 0 (success) or an error code

static int filemap_get_pages(struct kiocb *iocb, size_t count,
		struct folio_batch *fbatch, bool need_uptodate)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index;
	struct folio *folio;
	unsigned int flags;
	int err = 0;

	/* "last_index" is the index of the page beyond the end of the read */
	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
	if (fatal_signal_pending(current))
		return -EINTR;

	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	if (!folio_batch_count(fbatch)) {
		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		if (iocb->ki_flags & IOCB_NOWAIT)
			flags = memalloc_noio_save();
		page_cache_sync_readahead(mapping, ra, filp, index,
				last_index - index);
		if (iocb->ki_flags & IOCB_NOWAIT)
			memalloc_noio_restore(flags);
		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
	}
	if (!folio_batch_count(fbatch)) {
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
			return -EAGAIN;
		err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
		if (err == AOP_TRUNCATED_PAGE)
			goto retry;
		return err;
	}

	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
	if (folio_test_readahead(folio)) {
		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
		if (err)
			goto err;
	}
	if (!folio_test_uptodate(folio)) {
		if ((iocb->ki_flags & IOCB_WAITQ) &&
		    folio_batch_count(fbatch) > 1)
			iocb->ki_flags |= IOCB_NOWAIT;
		err = filemap_update_page(iocb, mapping, count, folio,
					  need_uptodate);
		if (err)
			goto err;
	}

	trace_mm_filemap_get_pages(mapping, index, last_index - 1);
	return 0;
err:
	if (err < 0)
		folio_put(folio);
	if (likely(--fbatch->nr))
		return 0;
	if (err == AOP_TRUNCATED_PAGE)
		goto retry;
	return err;
}

void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		goto out;

	if (!folio_try_get(folio))
		goto repeat;

	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return folio;
}
void *xas_load(struct xa_state *xas)
{
	void *entry = xas_start(xas);

	while (xa_is_node(entry)) {
		struct xa_node *node = xa_to_node(entry);

		if (xas->xa_shift > node->shift)
			break;
		entry = xas_descend(xas, node);
		if (node->shift == 0)
			break;
	}
	return entry;
}

Userspace touches mmap'ed memory
 └─> a page fault is taken
      └─> handle_mm_fault()
           └─> do_fault()
                └─> do_read_fault()
                     ├─> do_fault_around()		[fault-around readahead]
                     └─> __do_fault()
                          └─> vma->vm_ops->fault()
                               └─> filemap_fault()
                                    ├─> filemap_get_folio()		[look up the page cache]
                                    ├─> do_async_mmap_readahead()	[async readahead]
                                    ├─> do_sync_mmap_readahead()	[sync readahead]
                                    ├─> __filemap_get_folio()		[create the folio]
                                    ├─> lock_folio_maybe_drop_mmap()	[lock it]
                                    └─> filemap_read_folio()		[read the data]
                                         └─> mapping->a_ops->read_folio()

Synchronous readahead:
page_cache_sync_readahead()
 └─> page_cache_ra_order()
      └─> read_pages()
           └─> mapping->a_ops->readahead()
                └─> readahead_folio()	[fetch the folios one at a time]

Asynchronous readahead:
page_cache_async_ra()
 └─> page_cache_ra_order()
      └─> read_pages()
           └─> mapping->a_ops->readahead()

PageCache read flow:

 Read request (read() or an mmap page fault)
  ├─> read() syscall   → filemap_read()
  └─> mmap page fault  → filemap_fault()
       │
       └─> look up the page cache: filemap_get_folio()
            ├─> folio in the cache and uptodate
            │    └─> nothing further to read
            ├─> folio in the cache but !uptodate
            │    └─> filemap_read_folio()
            └─> folio not in the cache
                 └─> trigger readahead: readahead()
            │
            (for the latter two cases)
            └─> call the filesystem's read_folio: mapping->a_ops->read_folio()
                 └─> read the data from disk into the folio and set PG_uptodate
                      └─> return the data to userspace (copy_folio_to_iter)

read()
 └─> vfs_read()
      └─> file->f_op->read_iter()
           └─> generic_file_read_iter()
                └─> filemap_read()
                     ├─> filemap_get_pages()
                     │    ├─> filemap_get_read_batch()		[look up the page cache]
                     │    ├─> page_cache_sync_readahead()	[readahead on a cache miss]
                     │    ├─> filemap_create_folio()		[create a new folio]
                     │    │    └─> filemap_read_folio()
                     │    │         └─> mapping->a_ops->read_folio()
                     │    └─> filemap_update_page()		[update a non-uptodate folio]
                     │         └─> filemap_read_folio()
                     └─> copy_folio_to_iter()			[copy to userspace]
/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct folio_batch fbatch;
	int i, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;
	loff_t last_pos = ra->prev_pos;

	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	if (unlikely(!iov_iter_count(iter)))
		return 0;

	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
	folio_batch_init(&fbatch);

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
			iocb->ki_flags |= IOCB_NOWAIT;

		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
			break;

		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
			goto put_folios;
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a read accesses the same folio several times, only
		 * mark it as accessed the first time.
		 */
		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
				    fbatch.folios[0]))
			folio_mark_accessed(fbatch.folios[0]);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t fsize = folio_size(folio);
			size_t offset = iocb->ki_pos & (fsize - 1);
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
					     fsize - offset);
			size_t copied;

			if (end_offset < folio_pos(folio))
				break;
			if (i > 0)
				folio_mark_accessed(folio);
			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			copied = copy_folio_to_iter(folio, offset, bytes, iter);

			already_read += copied;
			iocb->ki_pos += copied;
			last_pos = iocb->ki_pos;

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_folios:
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_put(fbatch.folios[i]);
		folio_batch_init(&fbatch);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

	file_accessed(filp);
	ra->prev_pos = last_pos;
	return already_read ? already_read : error;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
/*
 * Batch-fetch folios from the xarray under RCU; stop when a folio that is
 * not uptodate, or that carries the readahead mark, is encountered; return
 * a batch covering a contiguous range.
 */
static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;
		if (xas.xa_index > max || xa_is_value(folio))
			break;
		if (xa_is_sibling(folio))
			break;
		if (!folio_try_get(folio))
			goto retry;

		if (unlikely(folio != xas_reload(&xas)))
			goto put_folio;

		if (!folio_batch_add(fbatch, folio))
			break;
		if (!folio_test_uptodate(folio))
			break;
		if (folio_test_readahead(folio))
			break;
		xas_advance(&xas, folio_next_index(folio) - 1);
		continue;
put_folio:
		folio_put(folio);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}