A reference count tracks the number of users an object (such as a page in memory)
has, allowing the kernel to determine when the object is free and can be deleted.
There are actually two types of reference counts for a normal page.
The first, stored in the _count field of struct page, is the total number of
references held to the page.
The second, kept in _mapcount, is the number of page table entries referring to this page.
A page-table mapping is a reference, so every such reference counted in _mapcount
is also tracked in _count; the latter should thus always be greater than or equal
to the former. Situations where _count can exceed _mapcount include pages mapped
for DMA and pages mapped into the kernel's address space with a function like
get_user_pages(). Locking a page into memory with mlock() will also increase _count.
The relative value of these two counters is important; if _count equals _mapcount,
the page can be reclaimed by locating and removing all of the page table entries.
But if _count is greater than _mapcount, the page is "pinned" and cannot be
reclaimed until the extra references are removed.
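As a rough illustration of that relationship (this is not kernel code, and it ignores the legitimate extra references held by things like the page cache and swap cache), the "pinned" test described above could be written as:

/*
 * Illustrative helper only (page_is_pinned is not a real kernel function):
 * if every reference to the page is explained by a page-table mapping,
 * reclaim can free the page by unmapping it; anything beyond that (DMA,
 * get_user_pages(), ...) pins it.  Real reclaim code also accounts for
 * page-cache and swap-cache references, which this sketch ignores.
 */
static bool page_is_pinned(struct page *page)
{
    return page_count(page) > page_mapcount(page);
}

For orientation, here is total_mapcount() as it ends up looking after the series of changes discussed in the rest of this post: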
int total_mapcount(struct page *page)
{
    int i, compound, ret;

    VM_BUG_ON_PAGE(PageTail(page), page);
    if (likely(!PageCompound(page)))
        return atomic_read(&page->_mapcount) + 1;
    compound = compound_mapcount(page);
    if (PageHuge(page))
        return compound;
    ret = compound;
    for (i = 0; i < HPAGE_PMD_NR; i++)
        ret += atomic_read(&page[i]._mapcount) + 1;
    /* File pages has compound_mapcount included in _mapcount */
    if (!PageAnon(page))
        return ret - compound * HPAGE_PMD_NR;
    if (PageDoubleMap(page))
        ret -= HPAGE_PMD_NR;
    return ret;
}
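total_mapcount() feeds exactly that kind of comparison. As a simplified sketch (the exact expression differs between kernel versions, and extra_pins here stands for the references the caller already expects, e.g. from the page cache), the THP split path refuses to split a pinned page roughly like this:

/*
 * Sketch only, not verbatim kernel code: the split can proceed only if
 * every reference beyond the caller's own and the expected extra_pins
 * is a mapping that total_mapcount() accounts for.
 */
if (total_mapcount(head) != page_count(head) - extra_pins - 1)
    return -EBUSY;    /* pinned by extra raw references, cannot split */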
In current kernels, a specific 4KB page can be treated as an individual
page, or it can be part of a huge page, but not both. If a huge page must
be split into individual pages, it is split completely for all users, the
compound page structure is torn down, and the huge page no longer exists.
static inline void get_page(struct page *page)
{
    /*
     * Getting a normal page or the head of a compound page
     * requires to already have an elevated page->_count. Only if
     * we're getting a tail page, the elevated page->_count is
     * required only in the head page, so for tail pages the
     * bugcheck only verifies that the page->_count isn't
     * negative.
     */
    VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
    atomic_inc(&page->_count);
    /*
     * Getting a tail page will elevate both the head and tail
     * page->_count(s).
     */
    if (unlikely(PageTail(page))) {
        /*
         * This is safe only because
         * __split_huge_page_refcount can't run under
         * get_page().
         */
        VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
        atomic_inc(&page->first_page->_count);
    }
}
The fundamental change in Kirill's patch set is to allow a huge page to be
split in one process's address space, while remaining a huge page in any
other address space where it is found.
From this point on, PMD-mapped and PTE-mapped views of the same page are no longer mutually incompatible.
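In code terms, splitting the mapping and splitting the page become two separate operations. A rough sketch of the post-4.5 interface:

/*
 * split_huge_pmd() only rewrites one process's PMD into HPAGE_PMD_NR PTEs
 * that still point into the same compound page; other processes keep
 * their PMD mappings and the huge page itself survives.
 */
split_huge_pmd(vma, pmd, address);

/*
 * split_huge_page() actually tears the compound page apart into 4KB
 * pages; it can fail (returning non-zero), e.g. when the page is pinned.
 */
if (split_huge_page(page))
    /* split failed: the page is still huge, caller must cope */;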
Next, let's see exactly what was changed about mapcount.
The first change comes from ddc58f27f9ee (2016-01-15, "mm: drop tail page refcounting"). This commit rolls back the earlier tail-page handling in get_page() shown above. To keep things short, let's look directly at the final result after the change.
static inline void get_page(struct page *page)
{
    page = compound_head(page);
    /*
     * Getting a normal page or the head of a compound page
     * requires to already have an elevated page->_count.
     */
    VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
    atomic_inc(&page->_count);
}

static inline void init_page_count(struct page *page)
{
    atomic_set(&page->_count, 1);
}

static inline void put_page(struct page *page)
{
    page = compound_head(page);
    if (put_page_testzero(page))
        __put_page(page);
}
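All three helpers now start by resolving a possible tail page to its head. For reference, compound_head() in kernels of this era works roughly as follows: a tail page stores a pointer to its head in page->compound_head, with bit 0 set as a tag.

static inline struct page *compound_head(struct page *page)
{
    unsigned long head = READ_ONCE(page->compound_head);

    /* Bit 0 set means "this is a tail page"; the rest is the head pointer. */
    if (unlikely(head & 1))
        return (struct page *)(head - 1);
    return page;
}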
The idea is to store separately how many times the page was mapped as
whole -- compound_mapcount.
Any time we map/unmap whole compound page (THP or hugetlb) -- we
increment/decrement compound_mapcount. When we map part of compound
page with PTE we operate on ->_mapcount of the subpage.
PageDoubleMap indicates that the compound page is mapped with PTEs as well
as PMDs.
This is required for optimization of rmap operations for THP: we can
postpone per small page mapcount accounting (and its overhead from atomic
operations) until the first PMD split.
For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up
by one. This reference will go away with last compound_mapcount.
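Where does compound_mapcount actually live? It is stored in the first tail page, and like _mapcount it is biased by -1, so a raw reading of 0 means one whole-page mapping. Roughly:

static inline atomic_t *compound_mapcount_ptr(struct page *page)
{
    return &page[1].compound_mapcount;
}

static inline int compound_mapcount(struct page *page)
{
    VM_BUG_ON_PAGE(!PageCompound(page), page);
    page = compound_head(page);
    return atomic_read(compound_mapcount_ptr(page)) + 1;
}

The snippet below then shows where PG_double_map gets set and cleared: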
/*
 * Set PG_double_map before dropping compound_mapcount to avoid
 * false-negative page_mapped().
 */
if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
    for (i = 0; i < HPAGE_PMD_NR; i++)
        atomic_inc(&page[i]._mapcount);
}

if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
    /* Last compound_mapcount is gone. */
    __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
    if (TestClearPageDoubleMap(page)) {
        /* No need in mapcount reference anymore */
        for (i = 0; i < HPAGE_PMD_NR; i++)
            atomic_dec(&page[i]._mapcount);
    }
}
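The "false-negative page_mapped()" that the first comment guards against comes from how page_mapped() treats compound pages: the page counts as mapped if either compound_mapcount or any subpage's _mapcount is non-negative. If compound_mapcount were dropped before the subpage _mapcounts were raised, a still-mapped page could briefly look unmapped. A simplified sketch of that check (page_mapped_sketch is not the real function name, and hugetlb details are omitted):

static inline bool page_mapped_sketch(struct page *page)
{
    int i;

    if (likely(!PageCompound(page)))
        return atomic_read(&page->_mapcount) >= 0;
    page = compound_head(page);
    if (atomic_read(compound_mapcount_ptr(page)) >= 0)
        return true;
    for (i = 0; i < HPAGE_PMD_NR; i++)
        if (atomic_read(&page[i]._mapcount) >= 0)
            return true;
    return false;
}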
PG_double_map optimization doesn't work for file pages since lifecycle
of file pages is different comparing to anon pages: file page can be
mapped again at any time.
It seems the LRU is involved here as well; that is something I still need to study more carefully. OK, let's look at the changes this version brings.
In this version, that is, in the commit above, the important changes are to these three functions:
page_add_file_rmap
total_mapcount
__page_mapcount
Before the change:
void page_add_file_rmap(struct page *page)
{
    lock_page_memcg(page);
    if (atomic_inc_and_test(&page->_mapcount)) {
        __inc_zone_page_state(page, NR_FILE_MAPPED);
        mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
    }
    unlock_page_memcg(page);
}

int total_mapcount(struct page *page)
{
    int i, ret;

    VM_BUG_ON_PAGE(PageTail(page), page);
    if (likely(!PageCompound(page)))
        return atomic_read(&page->_mapcount) + 1;
    ret = compound_mapcount(page);
    if (PageHuge(page))
        return ret;
    for (i = 0; i < HPAGE_PMD_NR; i++)
        ret += atomic_read(&page[i]._mapcount) + 1;
    if (PageDoubleMap(page))
        ret -= HPAGE_PMD_NR;
    return ret;
}

int __page_mapcount(struct page *page)
{
    int ret;

    ret = atomic_read(&page->_mapcount) + 1;
    page = compound_head(page);
    ret += atomic_read(compound_mapcount_ptr(page)) + 1;
    if (PageDoubleMap(page))
        ret--;
    return ret;
}
After the change:
void page_add_file_rmap(struct page *page, bool compound)
{
    int i, nr = 1;

    VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
    lock_page_memcg(page);
    if (compound && PageTransHuge(page)) {
        for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
            if (atomic_inc_and_test(&page[i]._mapcount))
                nr++;
        }
        if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
            goto out;
    } else {
        if (!atomic_inc_and_test(&page->_mapcount))
            goto out;
    }
    __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
    mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
out:
    unlock_page_memcg(page);
}
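The new bool compound argument tells the rmap code whether the whole compound page is being mapped at once; for a file THP that means bumping every subpage's _mapcount as well as compound_mapcount. Illustrative call sites (the exact callers vary between kernel versions):

/* Mapping a single 4KB (sub)page of a file with a PTE: */
page_add_file_rmap(page, false);

/* Mapping a whole file THP with one PMD: */
page_add_file_rmap(compound_head(page), true);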
int total_mapcount(struct page *page)
{
    int i, compound, ret;

    VM_BUG_ON_PAGE(PageTail(page), page);
    if (likely(!PageCompound(page)))
        return atomic_read(&page->_mapcount) + 1;
    compound = compound_mapcount(page);
    if (PageHuge(page))
        return compound;
    ret = compound;
    for (i = 0; i < HPAGE_PMD_NR; i++)
        ret += atomic_read(&page[i]._mapcount) + 1;
    /* File pages has compound_mapcount included in _mapcount */
    if (!PageAnon(page))
        return ret - compound * HPAGE_PMD_NR;
    if (PageDoubleMap(page))
        ret -= HPAGE_PMD_NR;
    return ret;
}
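A worked example may help with the new file-page branch (hypothetical numbers, assuming HPAGE_PMD_NR = 512): a file THP is PMD-mapped by two processes, and subpage 0 is additionally PTE-mapped once.

/*
 * compound_mapcount(page)          = 2   (raw value 1, plus the +1 bias)
 * page[i]._mapcount (raw), i != 0  = 1   (the two PMD mappings are folded
 *                                         into each subpage's _mapcount)
 * page[0]._mapcount (raw)          = 2   (one extra PTE mapping)
 *
 * ret = 2 + 511 * (1 + 1) + (2 + 1)        = 1027
 * return ret - compound * HPAGE_PMD_NR
 *        = 1027 - 2 * 512                  = 3
 *
 * i.e. two PMD mappings plus one PTE mapping, as expected.
 */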
int __page_mapcount(struct page *page)
{
    int ret;

    ret = atomic_read(&page->_mapcount) + 1;
    /*
     * For file THP page->_mapcount contains total number of mapping
     * of the page: no need to look into compound_mapcount.
     */
    if (!PageAnon(page) && !PageHuge(page))
        return ret;
    page = compound_head(page);
    ret += atomic_read(compound_mapcount_ptr(page)) + 1;
    if (PageDoubleMap(page))
        ret--;
    return ret;
}
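And one for the anonymous DoubleMap case: an anon THP stays PMD-mapped in process A while process B has split its own PMD into PTEs, so PageDoubleMap is set and every subpage's _mapcount carries the extra +1 offset. Asking __page_mapcount() about one subpage:

/*
 * page->_mapcount (raw) = -1 + 1 (DoubleMap offset) + 1 (B's PTE) = 1
 * compound_mapcount raw = 0                         (A's single PMD)
 *
 * ret  = 1 + 1 = 2
 * ret += 0 + 1 = 3
 * PageDoubleMap -> ret-- = 2
 *
 * i.e. the subpage is mapped twice: once through A's PMD and once
 * through B's PTE.
 */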