首页
社区
课程
招聘
[原创] PageCache详细分析(读写/写回) 基于 Linux 6.12.32版本
发表于: 2025-12-4 17:04 2155

[原创] PageCache详细分析(读写/写回) 基于 Linux 6.12.32版本

2025-12-4 17:04
2155

参考文章:

总览(概念上讲解):62cK9s2c8@1M7s2y4Q4x3@1q4Q4x3V1k6Q4x3V1k6*7K9s2g2S2L8X3I4S2L8W2)9J5k6i4A6Z5K9h3S2#2i4K6u0W2j5$3!0E0i4K6u0r3M7q4)9J5c8U0b7K6y4U0x3I4x3K6V1H3z5l9`.`.

PageCache的产生释放及优化:c7aK9s2c8@1M7s2y4Q4x3@1q4Q4x3V1k6Q4x3V1k6%4N6%4N6Q4x3X3g2U0L8X3u0D9L8$3N6K6i4K6u0W2j5$3!0E0i4K6u0r3M7$3!0X3N6r3y4D9L8%4g2V1i4K6u0r3M7q4)9J5c8U0p5$3y4U0b7&6x3K6t1$3i4K6u0W2K9s2c8E0L8l9`.`.

address_space结构解析:688K9s2c8@1M7s2y4Q4x3@1q4Q4x3V1k6Q4x3V1k6*7K9s2g2S2L8X3I4S2L8W2)9J5k6i4A6Z5K9h3S2#2i4K6u0W2j5$3!0E0i4K6u0r3M7q4)9J5c8U0f1@1x3o6R3H3y4U0R3%4y4R3`.`.

xarray详细解释(很厉害):c8dK9s2c8@1M7s2y4Q4x3@1q4Q4x3V1k6Q4x3V1k6*7K9s2g2S2L8X3I4S2L8W2)9J5k6i4A6Z5K9h3S2#2i4K6u0W2j5$3!0E0i4K6u0r3M7q4)9J5c8U0f1^5y4K6p5^5y4o6j5J5x3H3`.`.

本文较长,过程较为详细,请耐心阅读。本人水平有限,如果出现什么错误,请各位大佬指出。

本文分析的版本均为: Linux6.12.32

i_pages 的数据结构,用于管理 Page Cache。

XArray 是基于 Radix Tree(基数树)的实现。在 Linux 内核中 xarray 相比于树使用体验感更类似于一个无限增长的数组。

Node 中 slots 存放的 Entry ,有三种基本类型。

指向实际数据或子节点。

指向下一层 xa_node

标记节点正在被修改或即将释放,提示 RCU 读者重试

值条目,用于存储 swap/shadow 条目或标记指针。

现代Linux内核中,bdflush已被per-BDI(Backing Device Info)的写回线程替代。每个BDI有一个或多个bdi_writeback结构,每个bdi_writeback对应一个写回线程(通过workqueue实现)。通过这个机制来负责的 Linux 的脏数据回写。

Address_space 的创建(除 Swap Cache)主要分为两个板块(核心函数 alloc_inode)

路径1/2/3简述的是下方的调用链

核心主要是调用到:inode_init_always函数

注意:inode_init_always_gfp 只初始化了 address_space 的基础字段,i_pages 的初始化在 inode_init_once() 中完成。

分配 inode 结构体,对 inode 进行初始化。这里我们可以很清晰看见,这里通过 ops->alloc_inode(sb)/alloc_inode_sb 进行 inode 结构体的获取,然后通过 inode_init_always 对 inode 进行初始化

i_pages 的初始化只有一个路径(在 alloc_inode 内部触发):通过 inode_init_once() → __address_space_init_once()。

xa_init_flags() 的作用:

inode_init_once() 作为 slab 分配器的构造函数,在从 slab 分配 inode 时自动调用:

初始化 address_space 的一些基础属性

其实很好理解,获取文件偏移后比如index:0x00100010,那么从root entry开始用shift开始提取slots的下标

比如: index: 0x010010,先获取根 entry,shift 为 12

继续下降,然后直到遇到了pointer entry/value entry/null entry 就返回。

无论是mmap映射文件还是通过open直接打开文件然后读取,都是拿到index后去查找PageCaches,如果没有命中PageCaches就创建 page ,再进行读取。

和读取一样,有多种路径

脏数据回写会在后文单独给出

本质上这些不同路径都是在向 BDI 提交任务,或者直接调用 bdi 的写回函数

功能:创建全局写回工作队列bdi_wq,用于执行所有写回工作。

/* fgf_t flags controlling __filemap_get_folio() lookup behaviour. */
#define FGP_ACCESSED            ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT               ((__force fgf_t)0x00000004)
#define FGP_WRITE               ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT              ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP            ((__force fgf_t)0x00000040)
#define FGP_STABLE              ((__force fgf_t)0x00000080)
#define FGF_GET_ORDER(fgf)      (((__force unsigned)fgf) >> 26) /* top 6 bits */
 
/* Flag combination used by the buffered-write begin path. */
#define FGP_WRITEBEGIN          (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
/* fgf_t flags controlling __filemap_get_folio() lookup behaviour. */
#define FGP_ACCESSED            ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT               ((__force fgf_t)0x00000004)
#define FGP_WRITE               ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT              ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP            ((__force fgf_t)0x00000040)
#define FGP_STABLE              ((__force fgf_t)0x00000080)
#define FGF_GET_ORDER(fgf)      (((__force unsigned)fgf) >> 26) /* top 6 bits */
 
/* Flag combination used by the buffered-write begin path. */
#define FGP_WRITEBEGIN          (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
struct address_space {
    struct inode        *host;              // 指向所属的inode
    struct xarray       i_pages;            // XArray:存储页缓存的radix tree
    struct rw_semaphore invalidate_lock;    // 保护页面失效操作的读写锁
    gfp_t           gfp_mask;           // 页面分配的内存标志
    atomic_t        i_mmap_writable;     // 可写内存映射计数
    struct rb_root_cached   i_mmap;            // 内存映射的RB树(用于跟踪VMA)
    unsigned long       nrpages;            // 页缓存中的页面总数
    pgoff_t         writeback_index;     // 写回操作的起始索引
    const struct address_space_operations *a_ops;  // 地址空间操作函数集
    unsigned long       flags;              // 标志位(AS_*)
    errseq_t        wb_err;             // 写回错误序列号
    spinlock_t      i_private_lock;      // 私有数据保护锁
    struct list_head    i_private_list;      // 私有数据链表
    struct rw_semaphore i_mmap_rwsem;       // 保护i_mmap的读写锁
    void *          i_private_data;      // 文件系统私有数据指针
}
struct address_space {
    struct inode        *host;              // 指向所属的inode
    struct xarray       i_pages;            // XArray:存储页缓存的radix tree
    struct rw_semaphore invalidate_lock;    // 保护页面失效操作的读写锁
    gfp_t           gfp_mask;           // 页面分配的内存标志
    atomic_t        i_mmap_writable;     // 可写内存映射计数
    struct rb_root_cached   i_mmap;            // 内存映射的RB树(用于跟踪VMA)
    unsigned long       nrpages;            // 页缓存中的页面总数
    pgoff_t         writeback_index;     // 写回操作的起始索引
    const struct address_space_operations *a_ops;  // 地址空间操作函数集
    unsigned long       flags;              // 标志位(AS_*)
    errseq_t        wb_err;             // 写回错误序列号
    spinlock_t      i_private_lock;      // 私有数据保护锁
    struct list_head    i_private_list;      // 私有数据链表
    struct rw_semaphore i_mmap_rwsem;       // 保护i_mmap的读写锁
    void *          i_private_data;      // 文件系统私有数据指针
}
/* Top of an XArray: a spinlock, behaviour flags, and the root entry pointer. */
struct xarray {
        spinlock_t      xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        gfp_t           xa_flags;
        void __rcu *    xa_head;
};
/* Top of an XArray: a spinlock, behaviour flags, and the root entry pointer. */
struct xarray {
        spinlock_t      xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        gfp_t           xa_flags;
        void __rcu *    xa_head;
};
/* Fan-out of each xa_node: 2^4 = 16 slots with CONFIG_BASE_SMALL, else 2^6 = 64. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT      (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
#define XA_CHUNK_SIZE       (1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK       (XA_CHUNK_SIZE - 1)
/* Fan-out of each xa_node: 2^4 = 16 slots with CONFIG_BASE_SMALL, else 2^6 = 64. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT      (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
#define XA_CHUNK_SIZE       (1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK       (XA_CHUNK_SIZE - 1)
XArray树结构(3级示例):
 
                    xa_head
                      
                      
                  [xa_node (shift=12)]
                      
        ┌─────────────┼─────────────┐
        │             │             │
    slots[0]      slots[1]      slots[2]
        │             │             │
        ▼             ▼             ▼
   [xa_node]      [xa_node]      NULL
   (shift=6)      (shift=6)
        │             │
    ┌───┼───┐     ┌───┼───┐
    │   │   │     │   │   │
 slots[0] slots[1] slots[2] slots[3]
    │   │   │     │   │   │
    ▼   ▼   ▼     ▼   ▼   ▼
  folio folio folio folio folio
   (0)   (1)   (2)   (3)   (4)
 
索引计算:
- index = 0: 0 >> 12 = 0, 0 >> 6 = 0, 0 & 63 = 0 → slots[0][0]
- index = 1: 1 >> 12 = 0, 1 >> 6 = 0, 1 & 63 = 1 → slots[0][1]
- index = 64: 64 >> 12 = 0, 64 >> 6 = 1, 64 & 63 = 0 → slots[1][0]
XArray树结构(3级示例):
 
                    xa_head
                      
                      
                  [xa_node (shift=12)]
                      
        ┌─────────────┼─────────────┐
        │             │             │
    slots[0]      slots[1]      slots[2]
        │             │             │
        ▼             ▼             ▼
   [xa_node]      [xa_node]      NULL
   (shift=6)      (shift=6)
        │             │
    ┌───┼───┐     ┌───┼───┐
    │   │   │     │   │   │
 slots[0] slots[1] slots[2] slots[3]
    │   │   │     │   │   │
    ▼   ▼   ▼     ▼   ▼   ▼
  folio folio folio folio folio
   (0)   (1)   (2)   (3)   (4)
 
索引计算:
- index = 0: 0 >> 12 = 0, 0 >> 6 = 0, 0 & 63 = 0 → slots[0][0]
- index = 1: 1 >> 12 = 0, 1 >> 6 = 0, 1 & 63 = 1 → slots[0][1]
- index = 64: 64 >> 12 = 0, 64 >> 6 = 1, 64 & 63 = 0 → slots[1][0]
/*
 * Interior node of an XArray.  Each of the XA_CHUNK_SIZE slots holds a
 * pointer entry, a value entry, or an internal entry referencing a child
 * xa_node; marks[]/tags[] carries the per-slot mark bitmaps.
 */
struct xa_node {
    unsigned char   shift;      /* Bits remaining in each slot */
    unsigned char   offset;     /* Slot offset in parent */
    unsigned char   count;      /* Total entry count */
    unsigned char   nr_values;  /* Value entry count */
    struct xa_node __rcu *parent;   /* NULL at top of tree */
    struct xarray   *array;     /* The array we belong to */
    union {
        struct list_head private_list;  /* For tree user */
        struct rcu_head rcu_head;   /* Used when freeing node */
    };
    void __rcu  *slots[XA_CHUNK_SIZE];
    union {
        unsigned long   tags[XA_MAX_MARKS][XA_MARK_LONGS];
        unsigned long   marks[XA_MAX_MARKS][XA_MARK_LONGS];
    };
};
/*
 * Interior node of an XArray.  Each of the XA_CHUNK_SIZE slots holds a
 * pointer entry, a value entry, or an internal entry referencing a child
 * xa_node; marks[]/tags[] carries the per-slot mark bitmaps.
 */
struct xa_node {
    unsigned char   shift;      /* Bits remaining in each slot */
    unsigned char   offset;     /* Slot offset in parent */
    unsigned char   count;      /* Total entry count */
    unsigned char   nr_values;  /* Value entry count */
    struct xa_node __rcu *parent;   /* NULL at top of tree */
    struct xarray   *array;     /* The array we belong to */
    union {
        struct list_head private_list;  /* For tree user */
        struct rcu_head rcu_head;   /* Used when freeing node */
    };
    void __rcu  *slots[XA_CHUNK_SIZE];
    union {
        unsigned long   tags[XA_MAX_MARKS][XA_MARK_LONGS];
        unsigned long   marks[XA_MAX_MARKS][XA_MARK_LONGS];
    };
};
Entry (void *)
├─ [00] Pointer Entry (普通指针条目)
│   ├─ NULL Entry (空指针)
│   └─ 普通指针 (指向实际数据对象)
├─ [10] Internal Entry (内部条目)
│   ├─ Node Pointer (>4096)
│   ├─ Sibling Entry (0-62)
│   ├─ Retry Entry (256)
│   ├─ Zero Entry (257)
│   └─ Error Entry (-4094 ~ -2)
└─ [x1] Value Entry (值条目)
    ├─ 纯值条目 (低1位=1, 低2位=01)
    └─ Tagged Pointer (低2位=11)
Entry (void *)
├─ [00] Pointer Entry (普通指针条目)
│   ├─ NULL Entry (空指针)
│   └─ 普通指针 (指向实际数据对象)
├─ [10] Internal Entry (内部条目)
│   ├─ Node Pointer (>4096)
│   ├─ Sibling Entry (0-62)
│   ├─ Retry Entry (256)
│   ├─ Zero Entry (257)
│   └─ Error Entry (-4094 ~ -2)
└─ [x1] Value Entry (值条目)
    ├─ 纯值条目 (低1位=1, 低2位=01)
    └─ Tagged Pointer (低2位=11)
// A NULL pointer is itself the NULL entry (lowest two bits are 00).
void *entry = NULL;
// A NULL pointer is itself the NULL entry.
void *entry = NULL;
// Example: a pointer to a struct folio is a plain pointer entry.
void *entry = folio;  // low 2 bits = 00
// Example: a pointer to a struct folio is a plain pointer entry.
void *entry = folio;  // low 2 bits = 00
/* Tag an xa_node pointer as an internal entry by setting bit 1 (low bits = 10). */
static inline void *xa_mk_node(const struct xa_node *node)
{
    return (void *)((unsigned long)node | 2);
}
 
/* Private */
/* Undo xa_mk_node(): strip the tag bit to recover the xa_node pointer. */
static inline struct xa_node *xa_to_node(const void *entry)
{
    return (struct xa_node *)((unsigned long)entry - 2);
}
/* Tag an xa_node pointer as an internal entry by setting bit 1 (low bits = 10). */
static inline void *xa_mk_node(const struct xa_node *node)
{
    return (void *)((unsigned long)node | 2);
}
 
/* Private */
/* Undo xa_mk_node(): strip the tag bit to recover the xa_node pointer. */
static inline struct xa_node *xa_to_node(const void *entry)
{
    return (struct xa_node *)((unsigned long)entry - 2);
}
/* Node entries are internal entries above 4096; smaller internal values
 * encode sibling/retry/zero/error entries instead. */
static inline bool xa_is_node(const void *entry)
{
    return xa_is_internal(entry) && (unsigned long)entry > 4096;
}
/* Node entries are internal entries above 4096; smaller internal values
 * encode sibling/retry/zero/error entries instead. */
static inline bool xa_is_node(const void *entry)
{
    return xa_is_internal(entry) && (unsigned long)entry > 4096;
}
假设一个64KB的大页面(order=4),占用16个4KB页面的索引:
 
索引100-115都指向同一个folio
 
存储方式:
  i_pages[100] = folio指针  ← 规范槽位(canonical slot)
  i_pages[101] = sibling(100)  ← 指向槽位100
  i_pages[102] = sibling(100)
  ...
  i_pages[115] = sibling(100)
 
查找索引103时:
  xas_descend() → 发现sibling(100)
  → 跳转到槽位100 → 返回folio指针
// Build a sibling entry that redirects lookups to the canonical slot @offset.
static inline void *xa_mk_sibling(unsigned int offset)
{
    return xa_mk_internal(offset);
}
// A sibling entry is an internal entry below XA_CHUNK_SIZE - 1
// (only meaningful when CONFIG_XARRAY_MULTI is enabled).
static inline bool xa_is_sibling(const void *entry)
{
    return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
        (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
}
// Recover the canonical slot offset encoded in a sibling entry.
static inline unsigned long xa_to_sibling(const void *entry)
{
    return xa_to_internal(entry);
}
假设一个64KB的大页面(order=4),占用16个4KB页面的索引:
 
索引100-115都指向同一个folio
 
存储方式:
  i_pages[100] = folio指针  ← 规范槽位(canonical slot)
  i_pages[101] = sibling(100)  ← 指向槽位100
  i_pages[102] = sibling(100)
  ...
  i_pages[115] = sibling(100)
 
查找索引103时:
  xas_descend() → 发现sibling(100)
  → 跳转到槽位100 → 返回folio指针
// Build a sibling entry that redirects lookups to the canonical slot @offset.
static inline void *xa_mk_sibling(unsigned int offset)
{
    return xa_mk_internal(offset);
}
// A sibling entry is an internal entry below XA_CHUNK_SIZE - 1
// (only meaningful when CONFIG_XARRAY_MULTI is enabled).
static inline bool xa_is_sibling(const void *entry)
{
    return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) &&
        (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1));
}
// Recover the canonical slot offset encoded in a sibling entry.
static inline unsigned long xa_to_sibling(const void *entry)
{
    return xa_to_internal(entry);
}
/* Left in the slots of a node that is being dismantled; an RCU reader
 * that sees it must restart its walk. */
#define XA_RETRY_ENTRY      xa_mk_internal(256)
 
/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
    return unlikely(entry == XA_RETRY_ENTRY);
}
/* Left in the slots of a node that is being dismantled; an RCU reader
 * that sees it must restart its walk. */
#define XA_RETRY_ENTRY      xa_mk_internal(256)
 
/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
    return unlikely(entry == XA_RETRY_ENTRY);
}
/* Occupies a reserved slot; the normal read API reports it as NULL. */
#define XA_ZERO_ENTRY       xa_mk_internal(257)
 
/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry.  You can only see zero entries by using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
    return unlikely(entry == XA_ZERO_ENTRY);
}
/* Reserve @index: store XA_ZERO_ENTRY there if (and only if) the slot is empty. */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
    return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
/* Occupies a reserved slot; the normal read API reports it as NULL. */
#define XA_ZERO_ENTRY       xa_mk_internal(257)
 
/**
 * xa_is_zero() - Is the entry a zero entry?
 * @entry: Entry retrieved from the XArray
 *
 * The normal API will return NULL as the contents of a slot containing
 * a zero entry.  You can only see zero entries by using the advanced API.
 *
 * Return: %true if the entry is a zero entry.
 */
static inline bool xa_is_zero(const void *entry)
{
    return unlikely(entry == XA_ZERO_ENTRY);
}
/* Reserve @index: store XA_ZERO_ENTRY there if (and only if) the slot is empty. */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
    return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp));
}
/* Error entries are internal entries in the top -MAX_ERRNO range,
 * each encoding a negative errno value. */
static inline bool xa_is_err(const void *entry)
{
    return unlikely(xa_is_internal(entry) &&
            entry >= xa_mk_internal(-MAX_ERRNO));
}
/* Error entries are internal entries in the top -MAX_ERRNO range,
 * each encoding a negative errno value. */
static inline bool xa_is_err(const void *entry)
{
    return unlikely(xa_is_internal(entry) &&
            entry >= xa_mk_internal(-MAX_ERRNO));
}
/* Encode integer @v as a value entry: shift left one bit and set bit 0. */
static inline void *xa_mk_value(unsigned long v)
{
    WARN_ON((long)v < 0);
    return (void *)((v << 1) | 1);
}
 
/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
    return (unsigned long)entry >> 1;
}
 
/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
    return (unsigned long)entry & 1;
}
// Example
// create a value entry
void *entry = xa_mk_value(123);  // encode the integer 123 as a value entry
// extract the value
unsigned long value = xa_to_value(entry);  // returns 123
// check whether the entry is a value entry
if (xa_is_value(entry)) {
    // this is a value entry, not a pointer
}
/* Encode integer @v as a value entry: shift left one bit and set bit 0. */
static inline void *xa_mk_value(unsigned long v)
{
    WARN_ON((long)v < 0);
    return (void *)((v << 1) | 1);
}
 
/**
 * xa_to_value() - Get value stored in an XArray entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The value stored in the XArray entry.
 */
static inline unsigned long xa_to_value(const void *entry)
{
    return (unsigned long)entry >> 1;
}
 
/**
 * xa_is_value() - Determine if an entry is a value.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the entry is a value, false if it is a pointer.
 */
static inline bool xa_is_value(const void *entry)
{
    return (unsigned long)entry & 1;
}
// Example
// create a value entry
void *entry = xa_mk_value(123);  // encode the integer 123 as a value entry
// extract the value
unsigned long value = xa_to_value(entry);  // returns 123
// check whether the entry is a value entry
if (xa_is_value(entry)) {
    // this is a value entry, not a pointer
}
/*
 * Per-BDI (and, with CONFIG_CGROUP_WRITEBACK, per-cgroup) writeback state:
 * dirty-inode lists, bandwidth/dirty-throttling estimates, and the delayed
 * work item (dwork) executed by the writeback workqueue.
 */
struct bdi_writeback {
        struct backing_dev_info *bdi;   /* our parent bdi */
 
        unsigned long state;            /* Always use atomic bitops on this */
        unsigned long last_old_flush;   /* last old data flush */
 
        struct list_head b_dirty;       /* dirty inodes */
        struct list_head b_io;          /* parked for writeback */
        struct list_head b_more_io;     /* parked for more writeback */
        struct list_head b_dirty_time;  /* time stamps are dirty */
        spinlock_t list_lock;           /* protects the b_* lists */
 
        atomic_t writeback_inodes;      /* number of inodes under writeback */
        struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
        unsigned long bw_time_stamp;    /* last time write bw is updated */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;    /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;  /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
 
        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;
 
        struct fprop_local_percpu completions;
        int dirty_exceeded;
        enum wb_reason start_all_reason;
 
        spinlock_t work_lock;           /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
        struct delayed_work bw_dwork;   /* work item used for bandwidth estimate */
 
        struct list_head bdi_node;      /* anchored at bdi->wb_list */
 
#ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;       /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
        struct cgroup_subsys_state *memcg_css; /* the associated memcg */
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;    /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
        struct list_head b_attached;    /* attached inodes, protected by list_lock */
        struct list_head offline_node;  /* anchored at offline_cgwbs */
 
        union {
                struct work_struct release_work;
                struct rcu_head rcu;
        };
#endif
};
/*
 * Per-BDI (and, with CONFIG_CGROUP_WRITEBACK, per-cgroup) writeback state:
 * dirty-inode lists, bandwidth/dirty-throttling estimates, and the delayed
 * work item (dwork) executed by the writeback workqueue.
 */
struct bdi_writeback {
        struct backing_dev_info *bdi;   /* our parent bdi */
 
        unsigned long state;            /* Always use atomic bitops on this */
        unsigned long last_old_flush;   /* last old data flush */
 
        struct list_head b_dirty;       /* dirty inodes */
        struct list_head b_io;          /* parked for writeback */
        struct list_head b_more_io;     /* parked for more writeback */
        struct list_head b_dirty_time;  /* time stamps are dirty */
        spinlock_t list_lock;           /* protects the b_* lists */
 
        atomic_t writeback_inodes;      /* number of inodes under writeback */
        struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
        unsigned long bw_time_stamp;    /* last time write bw is updated */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;    /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;  /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
 
        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;
 
        struct fprop_local_percpu completions;
        int dirty_exceeded;
        enum wb_reason start_all_reason;
 
        spinlock_t work_lock;           /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
        struct delayed_work bw_dwork;   /* work item used for bandwidth estimate */
 
        struct list_head bdi_node;      /* anchored at bdi->wb_list */
 
#ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;       /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
        struct cgroup_subsys_state *memcg_css; /* the associated memcg */
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;    /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
        struct list_head b_attached;    /* attached inodes, protected by list_lock */
        struct list_head offline_node;  /* anchored at offline_cgwbs */
 
        union {
                struct work_struct release_work;
                struct rcu_head rcu;
        };
#endif
};
/*
 * One unit of writeback work queued on a bdi_writeback's work_list:
 * how many pages to write, for which superblock, in which sync mode,
 * and why the writeback was initiated.
 */
struct wb_writeback_work {
    long nr_pages;
    struct super_block *sb;
    enum writeback_sync_modes sync_mode;
    unsigned int tagged_writepages:1;
    unsigned int for_kupdate:1;
    unsigned int range_cyclic:1;
    unsigned int for_background:1;
    unsigned int for_sync:1;    /* sync(2) WB_SYNC_ALL writeback */
    unsigned int auto_free:1;   /* free on completion */
    enum wb_reason reason;      /* why was writeback initiated? */
 
    struct list_head list;      /* pending work list */
    struct wb_completion *done; /* set if the caller waits */
};
/*
 * One unit of writeback work queued on a bdi_writeback's work_list:
 * how many pages to write, for which superblock, in which sync mode,
 * and why the writeback was initiated.
 */
struct wb_writeback_work {
    long nr_pages;
    struct super_block *sb;
    enum writeback_sync_modes sync_mode;
    unsigned int tagged_writepages:1;
    unsigned int for_kupdate:1;
    unsigned int range_cyclic:1;
    unsigned int for_background:1;
    unsigned int for_sync:1;    /* sync(2) WB_SYNC_ALL writeback */
    unsigned int auto_free:1;   /* free on completion */
    enum wb_reason reason;      /* why was writeback initiated? */
 
    struct list_head list;      /* pending work list */
    struct wb_completion *done; /* set if the caller waits */
};
alloc_inode(sb)
  
  ├─> ops->alloc_inode(sb) 或 alloc_inode_sb()
  │   └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
  │       │
  │       └─> [slab分配器内部]
  │           └─> 如果是新分配的对象:
  │               └─> init_once(foo)  [slab构造函数]
  │                   └─> inode_init_once(inode)
  │                       ├─> memset(inode, 0, ...)  // 清零整个结构体
  │                       ├─> 初始化各种链表头
  │                       └─> __address_space_init_once(&inode->i_data)
  │                           └─> xa_init_flags(&mapping->i_pages, ...)  // ✅ i_pages初始化
  
  └─> inode_init_always(sb, inode)
      └─> inode_init_always_gfp(sb, inode, GFP_NOFS)
          ├─> 初始化inode基础字段
          └─> 初始化address_space运行时属性
              ├─> mapping->a_ops = &empty_aops
              ├─> mapping->host = inode
              ├─> mapping_set_gfp_mask(...)
              └─> inode->i_mapping = mapping
alloc_inode(sb)
  
  ├─> ops->alloc_inode(sb) 或 alloc_inode_sb()
  │   └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
  │       │
  │       └─> [slab分配器内部]
  │           └─> 如果是新分配的对象:
  │               └─> init_once(foo)  [slab构造函数]
  │                   └─> inode_init_once(inode)
  │                       ├─> memset(inode, 0, ...)  // 清零整个结构体
  │                       ├─> 初始化各种链表头
  │                       └─> __address_space_init_once(&inode->i_data)
  │                           └─> xa_init_flags(&mapping->i_pages, ...)  // ✅ i_pages初始化
  
  └─> inode_init_always(sb, inode)
      └─> inode_init_always_gfp(sb, inode, GFP_NOFS)
          ├─> 初始化inode基础字段
          └─> 初始化address_space运行时属性
              ├─> mapping->a_ops = &empty_aops
              ├─> mapping->host = inode
              ├─> mapping_set_gfp_mask(...)
              └─> inode->i_mapping = mapping
场景1:ext4文件系统挂载,创建根inode
  └─> ext4_iget(sb, EXT4_ROOT_INO, ...)
      └─> iget_locked(sb, ino)
          └─> alloc_inode(sb)
              └─> ext4_alloc_inode()
                  └─> kmem_cache_alloc(ext4_inode_cachep)
                      └─> init_once()被调用(slab构造函数)
                          └─> inode_init_once()
                              └─> __address_space_init_once()
                                  └─> xa_init_flags(&i_pages, ...)  // i_pages初始化
               
              └─> inode_init_always(sb, inode)
                  └─> inode_init_always_gfp()
                      └─> 初始化address_space其他字段
                          ├─> mapping->a_ops = &empty_aops
                          ├─> mapping->host = inode
                          ├─> mapping_set_gfp_mask(...)
                          └─> inode->i_mapping = mapping
 
场景2:创建新文件
  └─> ext4_new_inode()
      └─> new_inode(sb)
          └─> alloc_inode(sb)
              └─> 同上,init_once()初始化i_pages
              └─> inode_init_always()初始化其他字段
 
场景3:打开已存在文件
  └─> ext4_iget()
      └─> iget_locked()
          └─> 如果inode不在缓存中:
              └─> alloc_inode()
                  └─> 同上流程
场景1:ext4文件系统挂载,创建根inode
  └─> ext4_iget(sb, EXT4_ROOT_INO, ...)
      └─> iget_locked(sb, ino)
          └─> alloc_inode(sb)
              └─> ext4_alloc_inode()
                  └─> kmem_cache_alloc(ext4_inode_cachep)
                      └─> init_once()被调用(slab构造函数)
                          └─> inode_init_once()
                              └─> __address_space_init_once()
                                  └─> xa_init_flags(&i_pages, ...)  // i_pages初始化
               
              └─> inode_init_always(sb, inode)
                  └─> inode_init_always_gfp()
                      └─> 初始化address_space其他字段
                          ├─> mapping->a_ops = &empty_aops
                          ├─> mapping->host = inode
                          ├─> mapping_set_gfp_mask(...)
                          └─> inode->i_mapping = mapping
 
场景2:创建新文件
  └─> ext4_new_inode()
      └─> new_inode(sb)
          └─> alloc_inode(sb)
              └─> 同上,init_once()初始化i_pages
              └─> inode_init_always()初始化其他字段
 
场景3:打开已存在文件
  └─> ext4_iget()
      └─> iget_locked()
          └─> 如果inode不在缓存中:
              └─> alloc_inode()
                  └─> 同上流程
用户空间:open("newfile", O_CREAT)
  └─> open系统调用
      └─> do_sys_open() [fs/open.c]
          └─> do_filp_open() [fs/namei.c]
              └─> path_openat() [fs/namei.c]
                  └─> do_open() [fs/namei.c]
                      └─> vfs_create() [fs/namei.c]
                          └─> ext4_create() [fs/ext4/namei.c]
                              └─> ext4_new_inode_start_handle() [fs/ext4/namei.c]
                                  └─> __ext4_new_inode() [fs/ext4/ialloc.c:924]
                                      ├─> new_inode(sb) [fs/inode.c:1121]
                                      │   └─> new_inode_pseudo(sb)
                                      │       └─> alloc_inode(sb)
                                      │           └─> ext4_alloc_inode()
                                      │               └─> kmem_cache_alloc(ext4_inode_cachep)
                                      │                   └─> 分配时调用init_once回调
                                      │                       └─> inode_init_once() [fs/inode.c:424]
                                      │                           └─> __address_space_init_once(&inode->i_data)
                                      │                               └─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                      │                                   └─> i_pages初始化完成!
                                      
                                      └─> inode_init_always(sb, inode)
                                          └─> 初始化address_space的其他字段
用户空间:open("newfile", O_CREAT)
  └─> open系统调用
      └─> do_sys_open() [fs/open.c]
          └─> do_filp_open() [fs/namei.c]
              └─> path_openat() [fs/namei.c]
                  └─> do_open() [fs/namei.c]
                      └─> vfs_create() [fs/namei.c]
                          └─> ext4_create() [fs/ext4/namei.c]
                              └─> ext4_new_inode_start_handle() [fs/ext4/namei.c]
                                  └─> __ext4_new_inode() [fs/ext4/ialloc.c:924]
                                      ├─> new_inode(sb) [fs/inode.c:1121]
                                      │   └─> new_inode_pseudo(sb)
                                      │       └─> alloc_inode(sb)
                                      │           └─> ext4_alloc_inode()
                                      │               └─> kmem_cache_alloc(ext4_inode_cachep)
                                      │                   └─> 分配时调用init_once回调
                                      │                       └─> inode_init_once() [fs/inode.c:424]
                                      │                           └─> __address_space_init_once(&inode->i_data)
                                      │                               └─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                      │                                   └─> i_pages初始化完成!
                                      
                                      └─> inode_init_always(sb, inode)
                                          └─> 初始化address_space的其他字段
用户空间:open("existing_file", O_RDONLY)
  └─> open系统调用
      └─> do_sys_open()
          └─> do_filp_open()
              └─> path_openat()
                  └─> do_open()
                      └─> vfs_open()
                          └─> d_inode(path->dentry)
                              └─> 如果inode不在缓存中:
                                  └─> ext4_lookup() [fs/ext4/namei.c]
                                      └─> ext4_iget(sb, ino, EXT4_IGET_NORMAL)
                                          └─> __ext4_iget()
                                              └─> iget_locked(sb, ino)
                                                  └─> alloc_inode(sb)
                                                      └─> 同上,通过slab分配器分配
                                                          └─> inode_init_once()被调用
                                                              └─> i_pages初始化
用户空间:open("existing_file", O_RDONLY)
  └─> open系统调用
      └─> do_sys_open()
          └─> do_filp_open()
              └─> path_openat()
                  └─> do_open()
                      └─> vfs_open()
                          └─> d_inode(path->dentry)
                              └─> 如果inode不在缓存中:
                                  └─> ext4_lookup() [fs/ext4/namei.c]
                                      └─> ext4_iget(sb, ino, EXT4_IGET_NORMAL)
                                          └─> __ext4_iget()
                                              └─> iget_locked(sb, ino)
                                                  └─> alloc_inode(sb)
                                                      └─> 同上,通过slab分配器分配
                                                          └─> inode_init_once()被调用
                                                              └─> i_pages初始化
内核启动或添加swap分区
  └─> swapon系统调用
      └─> sys_swapon() [mm/swapfile.c]
          └─> setup_swap_extents() [mm/swapfile.c]
              └─> init_swap_address_space() [mm/swap_state.c:710]
                  ├─> 计算需要的address_space数量
                  │   └─> nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES)
                  ├─> kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL)
                  │   └─> 分配多个address_space结构
                  └─> 对每个address_space初始化:
                      ├─> xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ)
                      │   └─> 注意:没有XA_FLAGS_ACCOUNT标志
                      ├─> atomic_set(&space->i_mmap_writable, 0)
                      ├─> space->a_ops = &swap_aops
                      └─> mapping_set_no_writeback_tags(space)
                          └─> swap cache不使用writeback相关标记
内核启动或添加swap分区
  └─> swapon系统调用
      └─> sys_swapon() [mm/swapfile.c]
          └─> setup_swap_extents() [mm/swapfile.c]
              └─> init_swap_address_space() [mm/swap_state.c:710]
                  ├─> 计算需要的address_space数量
                  │   └─> nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES)
                  ├─> kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL)
                  │   └─> 分配多个address_space结构
                  └─> 对每个address_space初始化:
                      ├─> xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ)
                      │   └─> 注意:没有XA_FLAGS_ACCOUNT标志
                      ├─> atomic_set(&space->i_mmap_writable, 0)
                      ├─> space->a_ops = &swap_aops
                      └─> mapping_set_no_writeback_tags(space)
                          └─> swap cache不使用writeback相关标记
/*
 * alloc_inode - allocate a new inode for superblock @sb.
 *
 * Uses the filesystem-specific ->alloc_inode() when provided (e.g.
 * ext4_alloc_inode), otherwise falls back to the generic inode_cachep
 * slab cache.  inode_init_always() then resets the per-inode fields;
 * the i_pages XArray itself was already set up by the slab constructor
 * inode_init_once() and is not touched here.
 *
 * Returns the new inode, or NULL on allocation/initialisation failure.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
    const struct super_operations *ops = sb->s_op;
    struct inode *inode;
 
    if (ops->alloc_inode)
        inode = ops->alloc_inode(sb);
    else
        inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
 
    if (!inode)
        return NULL;
 
    if (unlikely(inode_init_always(sb, inode))) {
        /* init failed: tear the partially built inode back down */
        if (ops->destroy_inode) {
            ops->destroy_inode(inode);
            if (!ops->free_inode)
                return NULL;
        }
        /* free through the RCU callback so lockless readers stay safe */
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);
        return NULL;
    }
 
    return inode;
}
static struct inode *alloc_inode(struct super_block *sb)
{
    const struct super_operations *ops = sb->s_op;
    struct inode *inode;
 
    if (ops->alloc_inode)
        inode = ops->alloc_inode(sb);
    else
        inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
 
    if (!inode)
        return NULL;
 
    if (unlikely(inode_init_always(sb, inode))) {
        if (ops->destroy_inode) {
            ops->destroy_inode(inode);
            if (!ops->free_inode)
                return NULL;
        }
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);
        return NULL;
    }
 
    return inode;
}
/*
 * inode_init_once - slab constructor for inode objects.
 *
 * Registered as the kmem_cache ctor (e.g. ext4's init_once), so it is
 * called when a slab object is constructed rather than on every
 * allocation.  Zeroes the whole inode, initialises the embedded list
 * heads, and sets up the address_space at i_data — including the
 * i_pages XArray — via __address_space_init_once().
 */
void inode_init_once(struct inode *inode)
{
    memset(inode, 0, sizeof(*inode));
    INIT_HLIST_NODE(&inode->i_hash);
    INIT_LIST_HEAD(&inode->i_devices);
    INIT_LIST_HEAD(&inode->i_io_list);
    INIT_LIST_HEAD(&inode->i_wb_list);
    INIT_LIST_HEAD(&inode->i_lru);
    INIT_LIST_HEAD(&inode->i_sb_list);
    __address_space_init_once(&inode->i_data);
    i_size_ordered_init(inode);
}
/*
 * One-time initialisation of the address_space embedded in an inode.
 * i_pages gets an IRQ-safe xa_lock (XA_FLAGS_LOCK_IRQ) plus memcg
 * accounting of interior XArray nodes (XA_FLAGS_ACCOUNT) — contrast
 * with swap address_spaces, which omit XA_FLAGS_ACCOUNT.  i_mmap is
 * the cached rbtree of VMAs that map this file.
 */
static void __address_space_init_once(struct address_space *mapping)
{
    xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
    init_rwsem(&mapping->i_mmap_rwsem);
    INIT_LIST_HEAD(&mapping->i_private_list);
    spin_lock_init(&mapping->i_private_lock);
    mapping->i_mmap = RB_ROOT_CACHED;
}
void inode_init_once(struct inode *inode)
{
    memset(inode, 0, sizeof(*inode));
    INIT_HLIST_NODE(&inode->i_hash);
    INIT_LIST_HEAD(&inode->i_devices);
    INIT_LIST_HEAD(&inode->i_io_list);
    INIT_LIST_HEAD(&inode->i_wb_list);
    INIT_LIST_HEAD(&inode->i_lru);
    INIT_LIST_HEAD(&inode->i_sb_list);
    __address_space_init_once(&inode->i_data);
    i_size_ordered_init(inode);
}
static void __address_space_init_once(struct address_space *mapping)
{
    xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
    init_rwsem(&mapping->i_mmap_rwsem);
    INIT_LIST_HEAD(&mapping->i_private_list);
    spin_lock_init(&mapping->i_private_lock);
    mapping->i_mmap = RB_ROOT_CACHED;
}
ext4模块加载
  └─> module_init(init_ext4_fs) [fs/ext4/super.c]
      └─> init_inodecache() [fs/ext4/super.c]
          └─> ext4_inode_cachep = kmem_cache_create_usercopy(
                  "ext4_inode_cache",
                  sizeof(struct ext4_inode_info),
                  0,
                  (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                  offsetof(struct ext4_inode_info, i_data),
                  sizeof_field(struct ext4_inode_info, i_data),
                  init_once)  // 关键:init_once作为构造函数
              └─> kmem_cache_create_usercopy() [mm/slab_common.c]
                  └─> __kmem_cache_create()
                      └─> 创建slab缓存,注册init_once回调
 
后续任何分配inode的操作:
  └─> alloc_inode(sb) [fs/inode.c:261]
      └─> ext4_alloc_inode(sb) [fs/ext4/super.c]
          └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
              └─> slab_alloc() [mm/slub.c]
                  └─> 如果对象是新分配的(未初始化):
                      └─> slab_post_alloc_hook()
                          └─> 调用构造函数:init_once(foo)
                              └─> inode_init_once(inode)
                                  └─> memset(inode, 0, sizeof(*inode))
                                      └─> 清零整个inode结构
                                  └─> INIT_HLIST_NODE(&inode->i_hash)
                                  └─> INIT_LIST_HEAD(&inode->i_devices)
                                  └─> INIT_LIST_HEAD(&inode->i_io_list)
                                  └─> INIT_LIST_HEAD(&inode->i_wb_list)
                                  └─> INIT_LIST_HEAD(&inode->i_lru)
                                  └─> INIT_LIST_HEAD(&inode->i_sb_list)
                                  └─> __address_space_init_once(&inode->i_data)
                                      ├─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                      │   └─> 初始化XArray结构
                                      │       ├─> mapping->i_pages.xa_head = NULL
                                      │       ├─> mapping->i_pages.xa_flags = XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT
                                      │       └─> 初始化xa_lock(IRQ安全的spinlock)
                                      ├─> init_rwsem(&mapping->i_mmap_rwsem)
                                      ├─> INIT_LIST_HEAD(&mapping->i_private_list)
                                      ├─> spin_lock_init(&mapping->i_private_lock)
                                      └─> mapping->i_mmap = RB_ROOT_CACHED
                                  └─> i_size_ordered_init(inode)
ext4模块加载
  └─> module_init(init_ext4_fs) [fs/ext4/super.c]
      └─> init_inodecache() [fs/ext4/super.c]
          └─> ext4_inode_cachep = kmem_cache_create_usercopy(
                  "ext4_inode_cache",
                  sizeof(struct ext4_inode_info),
                  0,
                  (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                  offsetof(struct ext4_inode_info, i_data),
                  sizeof_field(struct ext4_inode_info, i_data),
                  init_once)  // 关键:init_once作为构造函数
              └─> kmem_cache_create_usercopy() [mm/slab_common.c]
                  └─> __kmem_cache_create()
                      └─> 创建slab缓存,注册init_once回调
 
后续任何分配inode的操作:
  └─> alloc_inode(sb) [fs/inode.c:261]
      └─> ext4_alloc_inode(sb) [fs/ext4/super.c]
          └─> kmem_cache_alloc(ext4_inode_cachep, GFP_KERNEL)
              └─> slab_alloc() [mm/slub.c]
                  └─> 如果对象是新分配的(未初始化):
                      └─> slab_post_alloc_hook()
                          └─> 调用构造函数:init_once(foo)
                              └─> inode_init_once(inode)
                                  └─> memset(inode, 0, sizeof(*inode))
                                      └─> 清零整个inode结构
                                  └─> INIT_HLIST_NODE(&inode->i_hash)
                                  └─> INIT_LIST_HEAD(&inode->i_devices)
                                  └─> INIT_LIST_HEAD(&inode->i_io_list)
                                  └─> INIT_LIST_HEAD(&inode->i_wb_list)
                                  └─> INIT_LIST_HEAD(&inode->i_lru)
                                  └─> INIT_LIST_HEAD(&inode->i_sb_list)
                                  └─> __address_space_init_once(&inode->i_data)
                                      ├─> xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT)
                                      │   └─> 初始化XArray结构
                                      │       ├─> mapping->i_pages.xa_head = NULL
                                      │       ├─> mapping->i_pages.xa_flags = XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT
                                      │       └─> 初始化xa_lock(IRQ安全的spinlock)
                                      ├─> init_rwsem(&mapping->i_mmap_rwsem)
                                      ├─> INIT_LIST_HEAD(&mapping->i_private_list)
                                      ├─> spin_lock_init(&mapping->i_private_lock)
                                      └─> mapping->i_mmap = RB_ROOT_CACHED
                                  └─> i_size_ordered_init(inode)
/*
 * inode_init_always_gfp - per-allocation initialisation of an inode.
 *
 * Runs every time an inode is handed out by alloc_inode() (unlike the
 * slab ctor inode_init_once(), which runs once per slab object).
 * Resets the generic inode fields and the basic fields of the embedded
 * address_space i_data; note that it deliberately does NOT reinitialise
 * mapping->i_pages — that XArray was set up in inode_init_once().
 *
 * Returns 0 on success, -ENOMEM if the LSM security blob allocation
 * fails (the only failure point here).
 */
int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;
 
        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        inode->i_state = 0;
        atomic64_set(&inode->i_sequence, 0);
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->i_ino = 0;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;
 
#ifdef CONFIG_CGROUP_WRITEBACK
        /* cgroup writeback "foreign" inode tracking state */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif
 
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
 
        atomic_set(&inode->i_dio_count, 0);
 
        /* Basic address_space fields; i_pages itself is left alone. */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        /* page cache folios may come from highmem and are migratable */
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->i_private_data = NULL;
        mapping->writeback_index = 0;
        init_rwsem(&mapping->invalidate_lock);
        lockdep_set_class_and_name(&mapping->invalidate_lock,
                                   &sb->s_type->invalidate_lock_key,
                                   "mapping.invalidate_lock");
        if (sb->s_iflags & SB_I_STABLE_WRITES)
                mapping_set_stable_writes(mapping);
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif
 
#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;
 
        if (unlikely(security_inode_alloc(inode, gfp)))
                return -ENOMEM;
 
        this_cpu_inc(nr_inodes);
 
        return 0;
}
int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;
 
        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        inode->i_state = 0;
        atomic64_set(&inode->i_sequence, 0);
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->i_ino = 0;
        inode->__i_nlink = 1;
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;
 
#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif
 
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
 
        atomic_set(&inode->i_dio_count, 0);
 
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->i_private_data = NULL;
        mapping->writeback_index = 0;
        init_rwsem(&mapping->invalidate_lock);
        lockdep_set_class_and_name(&mapping->invalidate_lock,
                                   &sb->s_type->invalidate_lock_key,
                                   "mapping.invalidate_lock");
        if (sb->s_iflags & SB_I_STABLE_WRITES)
                mapping_set_stable_writes(mapping);
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif
 
#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;
 
        if (unlikely(security_inode_alloc(inode, gfp)))
                return -ENOMEM;
 
        this_cpu_inc(nr_inodes);
 
        return 0;
}
slots_index = 0x10010 >> shift & 0x3f // 为 16 < XA_CHUNK_SIZE 合法
entry = node->slots[slots_index]
slots_index = 0x10010 >> shift & 0x3f // 为 16 < XA_CHUNK_SIZE 合法
entry = node->slots[slots_index]
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_folio(mapping, index)                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ __filemap_get_folio()                                       │
│  - 初始化folio = NULL                                       │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_entry(mapping, index)                          │
│  - XA_STATE(xas, &mapping->i_pages, index)                  │
│  - rcu_read_lock()                                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_reset(&xas)                                             │
│  - xas->xa_node = XAS_RESTART                               │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_load(&xas)                                              │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_start(&xas)                                             │
│  ├─ xas_valid()? → 检查状态                                 │
│  ├─ xa_head(xas->xa) → 获取根节点                          │
│  └─ 检查index范围                                           │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_node(entry)?           不是节点
         │                           │
         ▼                           ▼
┌──────────────────┐        ┌──────────────────┐
│ xa_to_node()    │        │ 返回entry        │
│ 转换节点指针     │        │ (可能是folio/NULL)│
└────────┬─────────┘        └──────────────────┘
         
         
┌─────────────────────────────────────────────────────────────┐
│ xas_descend(&xas, node)                                     │
│  ├─ get_offset(index, node)                                 │
│  │   └─ (index >> node->shift) & XA_CHUNK_MASK              │
│  ├─ xa_entry(node, offset) → 读取slot                       │
│  └─ while (xa_is_sibling(entry))                            │
│      └─ 跳转到规范槽位                                       │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_node(entry)?           不是节点
         │                           │
         ▼                           ▼
    继续下降                    返回entry
         │                           │
         └───────────┬───────────────┘
                     
                     
┌─────────────────────────────────────────────────────────────┐
│ xas_retry(&xas, entry)                                      │
│  ├─ xa_is_zero()? → 返回true,重试                          │
│  ├─ xa_is_retry()? → xas_reset(),返回true,重试            │
│  └─ 返回false,继续                                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_value()?              不是value
         │                           │
         ▼                           ▼
┌──────────────────┐        ┌──────────────────┐
│ 返回shadow/swap │        │ folio_try_get()  │
│ entry,不增加引用│        │ 增加引用计数      │
└──────────────────┘        └────────┬──────────┘
                                     
                                     
┌─────────────────────────────────────────────────────────────┐
│ xas_reload(&xas)                                            │
│  - 重新读取slot,验证entry未改变                            │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    entry改变了?                 未改变
         │                           │
         ▼                           ▼
    folio_put()                返回folio
    goto repeat                 rcu_read_unlock()
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_folio(mapping, index)                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ __filemap_get_folio()                                       │
│  - 初始化folio = NULL                                       │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ filemap_get_entry(mapping, index)                          │
│  - XA_STATE(xas, &mapping->i_pages, index)                  │
│  - rcu_read_lock()                                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_reset(&xas)                                             │
│  - xas->xa_node = XAS_RESTART                               │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_load(&xas)                                              │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
┌─────────────────────────────────────────────────────────────┐
│ xas_start(&xas)                                             │
│  ├─ xas_valid()? → 检查状态                                 │
│  ├─ xa_head(xas->xa) → 获取根节点                          │
│  └─ 检查index范围                                           │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_node(entry)?           不是节点
         │                           │
         ▼                           ▼
┌──────────────────┐        ┌──────────────────┐
│ xa_to_node()    │        │ 返回entry        │
│ 转换节点指针     │        │ (可能是folio/NULL)│
└────────┬─────────┘        └──────────────────┘
         
         
┌─────────────────────────────────────────────────────────────┐
│ xas_descend(&xas, node)                                     │
│  ├─ get_offset(index, node)                                 │
│  │   └─ (index >> node->shift) & XA_CHUNK_MASK              │
│  ├─ xa_entry(node, offset) → 读取slot                       │
│  └─ while (xa_is_sibling(entry))                            │
│      └─ 跳转到规范槽位                                       │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_node(entry)?           不是节点
         │                           │
         ▼                           ▼
    继续下降                    返回entry
         │                           │
         └───────────┬───────────────┘
                     
                     
┌─────────────────────────────────────────────────────────────┐
│ xas_retry(&xas, entry)                                      │
│  ├─ xa_is_zero()? → 返回true,重试                          │
│  ├─ xa_is_retry()? → xas_reset(),返回true,重试            │
│  └─ 返回false,继续                                          │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    xa_is_value()?              不是value
         │                           │
         ▼                           ▼
┌──────────────────┐        ┌──────────────────┐
│ 返回shadow/swap │        │ folio_try_get()  │
│ entry,不增加引用│        │ 增加引用计数      │
└──────────────────┘        └────────┬──────────┘
                                     
                                     
┌─────────────────────────────────────────────────────────────┐
│ xas_reload(&xas)                                            │
│  - 重新读取slot,验证entry未改变                            │
└──────────────────────┬──────────────────────────────────────┘
                       
                       
         ┌─────────────┴─────────────┐
         │                           │
    entry改变了?                 未改变
         │                           │
         ▼                           ▼
    folio_put()                返回folio
    goto repeat                 rcu_read_unlock()
index的二进制表示:
  [高位] ... [中间位] [低位]
    │         │        │
    │         │        └─> 叶子节点槽位(shift=0)
    │         └──────────> 中间节点槽位(shift=6)
    └────────────────────> 根节点槽位(shift=12)
/*
 * Slot index of @index within @node: shift out the bits belonging to
 * the levels below this node, then mask down to one chunk
 * (XA_CHUNK_MASK is 0x3f for the usual 64-slot nodes).
 */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
    return (index >> node->shift) & XA_CHUNK_MASK;
}
index的二进制表示:
  [高位] ... [中间位] [低位]
    │         │        │
    │         │        └─> 叶子节点槽位(shift=0)
    │         └──────────> 中间节点槽位(shift=6)
    └────────────────────> 根节点槽位(shift=12)
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
    return (index >> node->shift) & XA_CHUNK_MASK;
}
Level 0 (根节点, shift=12):
  offset = (0x1234 >> 12) & 0x3F = 0x1 & 0x3F = 1
  → 访问 slots[1]
 
Level 1 (中间节点, shift=6):
  offset = (0x1234 >> 6) & 0x3F = 0x48 & 0x3F = 0x08 = 8
  → 访问 slots[8]
 
Level 2 (叶子节点, shift=0):
  offset = (0x1234 >> 0) & 0x3F = 0x1234 & 0x3F = 0x34 = 52
  → 访问 slots[52]
Level 0 (根节点, shift=12):
  offset = (0x1234 >> 12) & 0x3F = 0x1 & 0x3F = 1
  → 访问 slots[1]
 
Level 1 (中间节点, shift=6):
  offset = (0x1234 >> 6) & 0x3F = 0x48 & 0x3F = 0x08 = 8
  → 访问 slots[8]
 
Level 2 (叶子节点, shift=0):
  offset = (0x1234 >> 0) & 0x3F = 0x1234 & 0x3F = 0x34 = 52
  → 访问 slots[52]
filemap_get_pages()
  
  ├─> 步骤1:计算页索引范围
  │   ├─> index = iocb->ki_pos >> PAGE_SHIFT
  │   └─> last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE)
  
  ├─> 步骤2:第一次尝试批量查找
  │   └─> filemap_get_read_batch(mapping, index, last_index - 1, fbatch)
  │       └─> 在i_pages中查找页面
  
  ├─> 步骤3:如果未找到页面
  │   ├─> 检查IOCB_NOIO标志
  │   ├─> 执行同步预读
  │   │   └─> page_cache_sync_readahead()
  │   └─> 再次尝试批量查找
  │       └─> filemap_get_read_batch()
  
  ├─> 步骤4:如果仍未找到
  │   └─> filemap_create_folio()
  │       └─> 创建新页面并加入Pages Cache
  
  ├─> 步骤5:处理找到的页面
  │   ├─> 检查readahead标志
  │   ├─> 检查uptodate标志
  │   └─> filemap_update_page()(如果需要)
  
  └─> 返回0(成功)或错误码
/*
 * filemap_get_pages - locate (or create) the cached folios for a read.
 *
 * Looks up the page-cache range covering [iocb->ki_pos, +@count).  On a
 * miss it kicks synchronous readahead and retries the lookup; if the
 * batch is still empty it allocates and inserts a folio itself via
 * filemap_create_folio().  Only the last folio of the batch may need
 * its readahead mark serviced or its contents brought uptodate.
 *
 * Returns 0 with at least one folio in @fbatch, or a negative errno
 * (-EINTR on fatal signal, -EAGAIN for non-blocking callers, etc.).
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        unsigned int flags;
        int err = 0;
 
        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;
 
        /* First attempt: batched lookup in mapping->i_pages. */
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                if (iocb->ki_flags & IOCB_NOWAIT)
                        flags = memalloc_noio_save();
                /* Cache miss: start synchronous readahead, then look again. */
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                if (iocb->ki_flags & IOCB_NOWAIT)
                        memalloc_noio_restore(flags);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                /* Still nothing cached: create and insert a folio ourselves. */
                err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }
 
        /* Only the final folio of the batch can need extra work. */
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }
 
        trace_mm_filemap_get_pages(mapping, index, last_index - 1);
        return 0;
err:
        /* Drop the failed folio; still succeed if earlier folios remain. */
        if (err < 0)
                folio_put(folio);
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}
filemap_get_pages()
  
  ├─> 步骤1:计算页索引范围
  │   ├─> index = iocb->ki_pos >> PAGE_SHIFT
  │   └─> last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE)
  
  ├─> 步骤2:第一次尝试批量查找
  │   └─> filemap_get_read_batch(mapping, index, last_index - 1, fbatch)
  │       └─> 在i_pages中查找页面
  
  ├─> 步骤3:如果未找到页面
  │   ├─> 检查IOCB_NOIO标志
  │   ├─> 执行同步预读
  │   │   └─> page_cache_sync_readahead()
  │   └─> 再次尝试批量查找
  │       └─> filemap_get_read_batch()
  
  ├─> 步骤4:如果仍未找到
  │   └─> filemap_create_folio()
  │       └─> 创建新页面并加入Pages Cache
  
  ├─> 步骤5:处理找到的页面
  │   ├─> 检查readahead标志
  │   ├─> 检查uptodate标志
  │   └─> filemap_update_page()(如果需要)
  
  └─> 返回0(成功)或错误码
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        unsigned int flags;
        int err = 0;
 
        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;
 
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                if (iocb->ki_flags & IOCB_NOWAIT)
                        flags = memalloc_noio_save();
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                if (iocb->ki_flags & IOCB_NOWAIT)
                        memalloc_noio_restore(flags);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }
 
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }
 
        trace_mm_filemap_get_pages(mapping, index, last_index - 1);
        return 0;
err:
        if (err < 0)
                folio_put(folio);
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}
/*
 * filemap_get_entry - look up the page-cache entry at @index under RCU.
 *
 * Returns NULL when nothing is cached, a value entry (shadow or swap
 * entry, see the comment below) with no reference taken, or a folio
 * with its refcount raised.  The folio_try_get() + xas_reload() pair
 * guards against the folio being freed and the slot reused between the
 * lockless xas_load() and the refcount bump: if the slot changed, drop
 * the reference and retry the walk from the start.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;
 
        rcu_read_lock();
repeat:
        xas_reset(&xas);
        folio = xas_load(&xas);
        /* retry entries mean the node is being modified/freed — restart */
        if (xas_retry(&xas, folio))
                goto repeat;
        /*
         * A shadow entry of a recently evicted page, or a swap entry from
         * shmem/tmpfs.  Return it without attempting to raise page count.
         */
        if (!folio || xa_is_value(folio))
                goto out;
 
        if (!folio_try_get(folio))
                goto repeat;
 
        if (unlikely(folio != xas_reload(&xas))) {
                folio_put(folio);
                goto repeat;
        }
out:
        rcu_read_unlock();
 
        return folio;
}
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;
 
        rcu_read_lock();
repeat:
        xas_reset(&xas);
        folio = xas_load(&xas);
        if (xas_retry(&xas, folio))
                goto repeat;
        /*
         * A shadow entry of a recently evicted page, or a swap entry from
         * shmem/tmpfs.  Return it without attempting to raise page count.
         */
        if (!folio || xa_is_value(folio))
                goto out;
 
        if (!folio_try_get(folio))
                goto repeat;
 
        if (unlikely(folio != xas_reload(&xas))) {
                folio_put(folio);
                goto repeat;
        }
out:
        rcu_read_unlock();
 
        return folio;
}
/*
 * xas_load - Load the entry at the index stored in @xas.
 *
 * Starting from the root (via xas_start()), descend one xa_node per
 * iteration until a non-node entry is found: a pointer entry, a value
 * entry, or NULL.  That terminal entry is returned to the caller.
 */
void *xas_load(struct xa_state *xas)
{
    void *entry = xas_start(xas);

    while (xa_is_node(entry)) {
        struct xa_node *node = xa_to_node(entry);

        /* Stop if we have descended below the level the caller asked for. */
        if (xas->xa_shift > node->shift)
            break;
        entry = xas_descend(xas, node);
        /* shift == 0 is the leaf level; its slots hold final entries. */
        if (node->shift == 0)
            break;
    }
    return entry;
}
/*
 * xas_load - Load the entry at the index stored in @xas.
 *
 * Starting from the root (via xas_start()), descend one xa_node per
 * iteration until a non-node entry is found: a pointer entry, a value
 * entry, or NULL.  That terminal entry is returned to the caller.
 */
void *xas_load(struct xa_state *xas)
{
    void *entry = xas_start(xas);

    while (xa_is_node(entry)) {
        struct xa_node *node = xa_to_node(entry);

        /* Stop if we have descended below the level the caller asked for. */
        if (xas->xa_shift > node->shift)
            break;
        entry = xas_descend(xas, node);
        /* shift == 0 is the leaf level; its slots hold final entries. */
        if (node->shift == 0)
            break;
    }
    return entry;
}
用户访问mmap内存
  └─> 触发缺页中断
      └─> handle_mm_fault()
          └─> do_fault()
              └─> do_read_fault()
                  ├─> do_fault_around()              [fault-around预读]
                  └─> __do_fault()
                      └─> vma->vm_ops->fault()
                          └─> filemap_fault()
                              ├─> filemap_get_folio() [查找PageCache]
                              ├─> do_async_mmap_readahead() [异步预读]
                              ├─> do_sync_mmap_readahead()  [同步预读]
                              ├─> __filemap_get_folio()     [创建页面]
                              ├─> lock_folio_maybe_drop_mmap() [加锁]
                              └─> filemap_read_folio()       [读取数据]
                                  └─> mapping->a_ops->read_folio()
用户访问mmap内存
  └─> 触发缺页中断
      └─> handle_mm_fault()
          └─> do_fault()
              └─> do_read_fault()
                  ├─> do_fault_around()              [fault-around预读]
                  └─> __do_fault()
                      └─> vma->vm_ops->fault()
                          └─> filemap_fault()
                              ├─> filemap_get_folio() [查找PageCache]
                              ├─> do_async_mmap_readahead() [异步预读]
                              ├─> do_sync_mmap_readahead()  [同步预读]
                              ├─> __filemap_get_folio()     [创建页面]
                              ├─> lock_folio_maybe_drop_mmap() [加锁]
                              └─> filemap_read_folio()       [读取数据]
                                  └─> mapping->a_ops->read_folio()
同步预读:
page_cache_sync_readahead()
  └─> page_cache_ra_order()
      └─> read_pages()
          └─> mapping->a_ops->readahead()
              └─> readahead_folio() [逐个获取页面]
 
异步预读:
page_cache_async_ra()
  └─> page_cache_ra_order()
      └─> read_pages()
          └─> mapping->a_ops->readahead()
同步预读:
page_cache_sync_readahead()
  └─> page_cache_ra_order()
      └─> read_pages()
          └─> mapping->a_ops->readahead()
              └─> readahead_folio() [逐个获取页面]
 
异步预读:
page_cache_async_ra()
  └─> page_cache_ra_order()
      └─> read_pages()
          └─> mapping->a_ops->readahead()
┌─────────────────────────────────────────────────────────────┐
│                    PageCache读取流程                          │
└─────────────────────────────────────────────────────────────┘
                              
                              
        ┌─────────────────────────────────────┐
        │   读取请求(read/mmap)              │
        └─────────────────────────────────────┘
                              
                ┌─────────────┴─────────────┐
                │                           │
                ▼                           ▼
    ┌──────────────────┐        ┌──────────────────┐
    │  read()系统调用   │        │  mmap缺页中断     │
    └──────────────────┘        └──────────────────┘
                │                           │
                ▼                           ▼
    ┌──────────────────┐        ┌──────────────────┐
    │ filemap_read()    │        │ filemap_fault()  │
    └──────────────────┘        └──────────────────┘
                │                           │
                └─────────────┬─────────────┘
                              
                              
                ┌─────────────────────┐
                │ 查找PageCache        │
                │ filemap_get_folio() │
                └─────────────────────┘
                              
                ┌─────────────┴─────────────┐
                │                           │
         ┌──────▼──────┐          ┌─────────▼─────────┐
         │ 页面在缓存中 │          │ 页面不在缓存中    │
         └──────┬──────┘          └─────────┬─────────┘
                │                           │
    ┌───────────┴──────────┐               │
    │                       │               │
    ▼                       ▼               ▼
┌──────────┐      ┌──────────────┐  ┌──────────────┐
│ 最新页面  │      │ 非最新页面    │  │ 触发预读      │
│ (uptodate)│      │ (!uptodate)  │  │ readahead()  │
└────┬─────┘      └──────┬───────┘  └──────┬───────┘
     │                   │                  │
     │                   ▼                  │
     │          ┌─────────────────┐         │
     │          │ filemap_read_   │         │
     │          │ folio()         │         │
     │          └────────┬────────┘         │
     │                   │                  │
     └──────────────────┴──────────────────┘
                        
                        
            ┌──────────────────────┐
            │ 调用文件系统read_folio│
            │ mapping->a_ops->     │
            │ read_folio()         │
            └──────────┬───────────┘
                       
                       
            ┌──────────────────────┐
            │ 从磁盘读取数据到页面   │
            │ 设置PG_uptodate标志   │
            └──────────┬───────────┘
                       
                       
            ┌──────────────────────┐
            │ 返回数据给用户空间     │
            │ (copy_folio_to_iter)  │
            └──────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│                    PageCache读取流程                          │
└─────────────────────────────────────────────────────────────┘
                              
                              
        ┌─────────────────────────────────────┐
        │   读取请求(read/mmap)              │
        └─────────────────────────────────────┘
                              
                ┌─────────────┴─────────────┐
                │                           │
                ▼                           ▼
    ┌──────────────────┐        ┌──────────────────┐
    │  read()系统调用   │        │  mmap缺页中断     │
    └──────────────────┘        └──────────────────┘
                │                           │
                ▼                           ▼
    ┌──────────────────┐        ┌──────────────────┐
    │ filemap_read()    │        │ filemap_fault()  │
    └──────────────────┘        └──────────────────┘
                │                           │
                └─────────────┬─────────────┘
                              
                              
                ┌─────────────────────┐
                │ 查找PageCache        │
                │ filemap_get_folio() │
                └─────────────────────┘
                              
                ┌─────────────┴─────────────┐
                │                           │
         ┌──────▼──────┐          ┌─────────▼─────────┐
         │ 页面在缓存中 │          │ 页面不在缓存中    │
         └──────┬──────┘          └─────────┬─────────┘
                │                           │
    ┌───────────┴──────────┐               │
    │                       │               │
    ▼                       ▼               ▼
┌──────────┐      ┌──────────────┐  ┌──────────────┐
│ 最新页面  │      │ 非最新页面    │  │ 触发预读      │
│ (uptodate)│      │ (!uptodate)  │  │ readahead()  │
└────┬─────┘      └──────┬───────┘  └──────┬───────┘
     │                   │                  │
     │                   ▼                  │
     │          ┌─────────────────┐         │
     │          │ filemap_read_   │         │
     │          │ folio()         │         │
     │          └────────┬────────┘         │
     │                   │                  │
     └──────────────────┴──────────────────┘
                        
                        
            ┌──────────────────────┐
            │ 调用文件系统read_folio│
            │ mapping->a_ops->     │
            │ read_folio()         │
            └──────────┬───────────┘
                       
                       
            ┌──────────────────────┐
            │ 从磁盘读取数据到页面   │
            │ 设置PG_uptodate标志   │
            └──────────┬───────────┘
                       
                       
            ┌──────────────────────┐
            │ 返回数据给用户空间     │
            │ (copy_folio_to_iter)  │
            └──────────────────────┘
read()
  └─> vfs_read()
      └─> file->f_op->read_iter()
          └─> generic_file_read_iter()
              └─> filemap_read()
                  ├─> filemap_get_pages()
                  │   ├─> filemap_get_read_batch()  [从PageCache查找]
                  │   ├─> page_cache_sync_readahead() [缓存未命中时预读]
                  │   ├─> filemap_create_folio()     [创建新页面]
                  │   │   └─> filemap_read_folio()
                  │   │       └─> mapping->a_ops->read_folio()
                  │   └─> filemap_update_page()      [更新非最新页面]
                  │       └─> filemap_read_folio()
                  └─> copy_folio_to_iter()            [复制到用户空间]
read()
  └─> vfs_read()
      └─> file->f_op->read_iter()
          └─> generic_file_read_iter()
              └─> filemap_read()
                  ├─> filemap_get_pages()
                  │   ├─> filemap_get_read_batch()  [从PageCache查找]
                  │   ├─> page_cache_sync_readahead() [缓存未命中时预读]
                  │   ├─> filemap_create_folio()     [创建新页面]
                  │   │   └─> filemap_read_folio()
                  │   │       └─> mapping->a_ops->read_folio()
                  │   └─> filemap_update_page()      [更新非最新页面]
                  │       └─> filemap_read_folio()
                  └─> copy_folio_to_iter()            [复制到用户空间]
/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t already_read)
{
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct folio_batch fbatch;
        int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
        loff_t last_pos = ra->prev_pos;

        /* Reading at or beyond the filesystem's size limit yields nothing. */
        if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /* Clamp the request so it cannot cross s_maxbytes. */
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
        folio_batch_init(&fbatch);

        do {
                cond_resched();

                /*
                 * If we've already successfully copied some data, then we
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;

                /* Stop at EOF; the file may have been truncated meanwhile. */
                if (unlikely(iocb->ki_pos >= i_size_read(inode)))
                        break;

                /* Fill fbatch with uptodate folios covering the read range. */
                error = filemap_get_pages(iocb, iter->count, &fbatch, false);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(mapping);

                /*
                 * When a read accesses the same folio several times, only
                 * mark it as accessed the first time.
                 */
                if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
                                    fbatch.folios[0]))
                        folio_mark_accessed(fbatch.folios[0]);

                /* Copy each folio in the batch out to the iov_iter. */
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t fsize = folio_size(folio);
                        /* Byte offset of ki_pos within this folio. */
                        size_t offset = iocb->ki_pos & (fsize - 1);
                        size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                                             fsize - offset);
                        size_t copied;

                        /* Folio starts past the end of the read; done. */
                        if (end_offset < folio_pos(folio))
                                break;
                        if (i > 0)
                                folio_mark_accessed(folio);
                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        copied = copy_folio_to_iter(folio, offset, bytes, iter);

                        already_read += copied;
                        iocb->ki_pos += copied;
                        last_pos = iocb->ki_pos;

                        /* Short copy means a userspace fault occurred. */
                        if (copied < bytes) {
                                error = -EFAULT;
                                break;
                        }
                }
put_folios:
                /* Drop the references taken by filemap_get_pages(). */
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_put(fbatch.folios[i]);
                folio_batch_init(&fbatch);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

        file_accessed(filp);
        /* Remember the last position for the readahead heuristics. */
        ra->prev_pos = last_pos;
        return already_read ? already_read : error;
}
/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t already_read)
{
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct folio_batch fbatch;
        int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
        loff_t last_pos = ra->prev_pos;

        /* Reading at or beyond the filesystem's size limit yields nothing. */
        if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /* Clamp the request so it cannot cross s_maxbytes. */
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
        folio_batch_init(&fbatch);

        do {
                cond_resched();

                /*
                 * If we've already successfully copied some data, then we
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;

                /* Stop at EOF; the file may have been truncated meanwhile. */
                if (unlikely(iocb->ki_pos >= i_size_read(inode)))
                        break;

                /* Fill fbatch with uptodate folios covering the read range. */
                error = filemap_get_pages(iocb, iter->count, &fbatch, false);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(mapping);

                /*
                 * When a read accesses the same folio several times, only
                 * mark it as accessed the first time.
                 */
                if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
                                    fbatch.folios[0]))
                        folio_mark_accessed(fbatch.folios[0]);

                /* Copy each folio in the batch out to the iov_iter. */
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t fsize = folio_size(folio);
                        /* Byte offset of ki_pos within this folio. */
                        size_t offset = iocb->ki_pos & (fsize - 1);
                        size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                                             fsize - offset);
                        size_t copied;

                        /* Folio starts past the end of the read; done. */
                        if (end_offset < folio_pos(folio))
                                break;
                        if (i > 0)
                                folio_mark_accessed(folio);
                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        copied = copy_folio_to_iter(folio, offset, bytes, iter);

                        already_read += copied;
                        iocb->ki_pos += copied;
                        last_pos = iocb->ki_pos;

                        /* Short copy means a userspace fault occurred. */
                        if (copied < bytes) {
                                error = -EFAULT;
                                break;
                        }
                }
put_folios:
                /* Drop the references taken by filemap_get_pages(). */
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_put(fbatch.folios[i]);
                folio_batch_init(&fbatch);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

        file_accessed(filp);
        /* Remember the last position for the readahead heuristics. */
        ra->prev_pos = last_pos;
        return already_read ? already_read : error;
}
/*
 * filemap_get_pages - Collect uptodate folios for a read.
 *
 * Fills @fbatch with referenced folios covering [iocb->ki_pos,
 * iocb->ki_pos + count).  If the cache misses, it triggers synchronous
 * readahead and, failing that, allocates a folio and reads it itself.
 * Returns 0 on success or a negative error.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        unsigned int flags;
        int err = 0;

        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;

        /* First attempt: grab whatever is already cached. */
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                /* NOWAIT: forbid blocking I/O inside the readahead path. */
                if (iocb->ki_flags & IOCB_NOWAIT)
                        flags = memalloc_noio_save();
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                if (iocb->ki_flags & IOCB_NOWAIT)
                        memalloc_noio_restore(flags);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        /* Still empty: allocate and read a folio ourselves. */
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }

        /* Only the last folio in the batch may need extra handling. */
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }

        trace_mm_filemap_get_pages(mapping, index, last_index - 1);
        return 0;
err:
        /* Drop the reference on the folio that failed. */
        if (err < 0)
                folio_put(folio);
        /* If other folios remain in the batch, let the caller use them. */
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}
/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 */
 /*
  * Batch-fetch folios from the XArray under RCU.
  * Stops at a folio that is not uptodate or carries the readahead mark.
  * Returns a batch covering a contiguous range of the file.
  */
static void filemap_get_read_batch(struct address_space *mapping,
                pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                /* Retry entries mean concurrent modification; try again. */
                if (xas_retry(&xas, folio))
                        continue;
                /* Stop past @max or at a shadow/swap value entry. */
                if (xas.xa_index > max || xa_is_value(folio))
                        break;
                if (xa_is_sibling(folio))
                        break;
                /* Folio being freed: restart the walk at this index. */
                if (!folio_try_get(folio))
                        goto retry;

                /* Folio was replaced after we took the reference. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                if (!folio_batch_add(fbatch, folio))
                        break;
                /* Caller must handle a non-uptodate or readahead folio last. */
                if (!folio_test_uptodate(folio))
                        break;
                if (folio_test_readahead(folio))
                        break;
                /* Skip over the rest of a large folio in one step. */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);
retry:
                xas_reset(&xas);
        }
        rcu_read_unlock();
}
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        struct file_ra_state *ra = &filp->f_ra;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        unsigned int flags;
        int err = 0;
 
        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;
 
        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                if (iocb->ki_flags & IOCB_NOWAIT)
                        flags = memalloc_noio_save();
                page_cache_sync_readahead(mapping, ra, filp, index,
                                last_index - index);
                if (iocb->ki_flags & IOCB_NOWAIT)
                        memalloc_noio_restore(flags);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                        return -EAGAIN;
                err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }
 
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }
 
        trace_mm_filemap_get_pages(mapping, index, last_index - 1);
        return 0;
err:
        if (err < 0)

[培训]Windows内核深度攻防:从Hook技术到Rootkit实战!

最后于 2025-12-4 17:05 被Elenia编辑 ,原因: 修正
收藏
免费 0
支持
分享
最新回复 (1)
雪    币: 5630
活跃值: (9442)
能力值: ( LV2,RANK:10 )
在线值:
发帖
回帖
粉丝
2
2025-12-5 09:13
0
游客
登录 | 注册 方可回帖
返回