A reference count tracks the number of users an object (such as a page in memory)
has, allowing the kernel to determine when the object is no longer in use and can
be freed.
There are actually two types of reference counts for a normal page.
The first, stored in the _count field of struct page, is the total number of
references held to the page.
The second, kept in _mapcount, is the number of page table entries referring to this page.
A page-table mapping is a reference, so every mapping counted in _mapcount is also
reflected in _count; _count should thus always be greater than or equal to _mapcount.
Situations where _count can exceed _mapcount include pages mapped
for DMA and pages mapped into the kernel's address space with a function like
get_user_pages(). Locking a page into memory with mlock() will also increase _count.
The relative value of these two counters is important; if _count equals _mapcount,
the page can be reclaimed by locating and removing all of the page table entries.
But if _count is greater than _mapcount, the page is "pinned" and cannot be
reclaimed until the extra references are removed.
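Reclaim logic expresses that comparison in terms of the stored counters. As a minimal
sketch (not kernel code; the helper name is made up, and the base references held by
the page cache and LRU, which real reclaim has to account for, are ignored):

static inline bool page_is_pinned_sketch(struct page *page)
{
        int refs = atomic_read(&page->_count);        /* total references */
        int maps = atomic_read(&page->_mapcount) + 1; /* _mapcount is stored biased by -1 */

        /* Extra references (DMA, get_user_pages(), mlock(), ...) pin the page. */
        return refs > maps;
}

For huge pages the bookkeeping is considerably more involved, as the total_mapcount()
helper below shows: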
int total_mapcount(struct page *page)
{
        int i, compound, ret;

        VM_BUG_ON_PAGE(PageTail(page), page);

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        compound = compound_mapcount(page);
        if (PageHuge(page))
                return compound;
        ret = compound;
        for (i = 0; i < HPAGE_PMD_NR; i++)
                ret += atomic_read(&page[i]._mapcount) + 1;
        /* File pages has compound_mapcount included in _mapcount */
        if (!PageAnon(page))
                return ret - compound * HPAGE_PMD_NR;
        if (PageDoubleMap(page))
                ret -= HPAGE_PMD_NR;
        return ret;
}
Before Kirill Shutemov's THP refcounting rework, a specific 4KB page could either be
treated as an individual page or be part of a huge page, but not both. If a huge page
had to be split into individual pages, it was split completely for all users: the
compound page structure was torn down and the huge page ceased to exist.
static inline void get_page(struct page *page)
{
        /*
         * Getting a normal page or the head of a compound page
         * requires to already have an elevated page->_count. Only if
         * we're getting a tail page, the elevated page->_count is
         * required only in the head page, so for tail pages the
         * bugcheck only verifies that the page->_count isn't
         * negative.
         */
        VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
        atomic_inc(&page->_count);
        /*
         * Getting a tail page will elevate both the head and tail
         * page->_count(s).
         */
        if (unlikely(PageTail(page))) {
                /*
                 * This is safe only because
                 * __split_huge_page_refcount can't run under
                 * get_page().
                 */
                VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
                atomic_inc(&page->first_page->_count);
        }
}
__split_huge_page_refcount() splits the THP back into normal pages; for each tail page
it roughly does (pseudocode):
        atomic_sub(page_tail->_count, page->_count)
        atomic_add(page_mapcount(page) + 1, page_tail->_count)
        page_tail->_mapcount = page->_mapcount
        put_page(page_tail)
a. new thp with __do_huge_pmd_anonymous_page()
page[0] +-----------------------+
        | _count                | = 1          page refcount set to 1
        | _mapcount             | = -1 -> 0    from -1 to 0
page[1] +-----------------------+
        | _count                | = 0
        | _mapcount             | = -1
        +-----------------------+

b. just get_page(page[1]) for example
page[0] +-----------------------+
        | _count                | = 1 -> 2     add 1 to account for page[1]
        | _mapcount             | = 0
page[1] +-----------------------+
        | _count                | = 0 -> 1     add 1 for itself
        | _mapcount             | = -1
        +-----------------------+

c. __split_huge_page_refcount()
page[0] +-----------------------+
        | _count                | = 2 -> 1     dec 1 for page[1]._count
        | _mapcount             | = 0
page[1] +-----------------------+
        | _count                | = 1 -> 2     one for the mapping, one for get_page()
        | _mapcount             | = -1 -> 0    set to page[0]._mapcount
        +-----------------------+

d. put_page(page[1])
Since the page is no longer a compound page, just decreasing _count is enough.
page[0] +-----------------------+
        | _count                | = 1
        | _mapcount             | = 0
page[1] +-----------------------+
        | _count                | = 2 -> 1     dec 1
        | _mapcount             | = 0
        +-----------------------+
The fundamental change in Kirill's patch set is to allow a huge page to be
split in one process's address space, while remaining a huge page in any
other address space where it is found.
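In other words, the PMD mapping in one mm can be torn down on its own with
split_huge_pmd(), while split_huge_page() is what actually destroys the compound page
for everybody. A conceptual sketch (the wrapper function below is illustrative only,
not kernel code):

/* Illustrative: remap this VMA's huge page as 4KB PTEs without
 * splitting the underlying compound page. */
static void split_mapping_only_example(struct vm_area_struct *vma,
                                       pmd_t *pmd, unsigned long addr)
{
        /* Other processes mapping the same THP keep their PMD mappings. */
        split_huge_pmd(vma, pmd, addr);
}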
static inline void get_page(struct page *page)
{
        page = compound_head(page);
        /*
         * Getting a normal page or the head of a compound page
         * requires to already have an elevated page->_count.
         */
        VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
        atomic_inc(&page->_count);
}

static inline void init_page_count(struct page *page)
{
        atomic_set(&page->_count, 1);
}

static inline void put_page(struct page *page)
{
        page = compound_head(page);
        if (put_page_testzero(page))
                __put_page(page);
}
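With compound_head() folded into get_page() and put_page(), every reference taken on
any subpage now lands on the head page. A small illustrative check of that invariant
(assumption: post-rework helpers; the function below is not kernel code and ignores
races):

static void head_refcount_example(struct page *subpage)
{
        struct page *head = compound_head(subpage);
        int before = page_count(head);   /* reads the head's _count */

        get_page(subpage);               /* increments head->_count */
        WARN_ON(page_count(head) != before + 1);
        put_page(subpage);               /* decrements head->_count again */
}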
* The introduction of compound_mapcount
* The introduction of PageDoubleMap
The idea is to store separately how many times the page was mapped as a whole --
compound_mapcount.
Any time we map/unmap a whole compound page (THP or hugetlb) we
increment/decrement compound_mapcount. When we map part of a compound
page with a PTE we operate on the ->_mapcount of that subpage.
PageDoubleMap indicates that the compound page is mapped with PTEs as well
as PMDs.
This is required for optimization of rmap operations for THP: we can
postpone per-small-page mapcount accounting (and its overhead from atomic
operations) until the first PMD split.
For such a page, PageDoubleMap means that ->_mapcount in all sub-pages is offset up
by one. This additional reference goes away with the last compound_mapcount.
Before:
static inline int page_mapcount(struct page *page)
{
        VM_BUG_ON_PAGE(PageSlab(page), page);
        return atomic_read(&page->_mapcount) + 1;
}

After:
static inline int page_mapcount(struct page *page)
{
        int ret;

        VM_BUG_ON_PAGE(PageSlab(page), page);
        ret = atomic_read(&page->_mapcount) + 1;
        if (PageCompound(page)) {
                page = compound_head(page);
                ret += atomic_read(compound_mapcount_ptr(page)) + 1;
                if (PageDoubleMap(page))
                        ret--;
        }
        return ret;
}
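A worked example of the 'After' version, with hypothetical numbers: an anonymous THP
is PMD-mapped by process A, while process B's PMD mapping has been split into PTEs,
so PG_double_map is set and each subpage is mapped by exactly one PTE in B:

/*
 * Stored values (note the -1 bias and the DoubleMap offset of +1 that
 * is applied to every subpage's _mapcount):
 *
 *   subpage->_mapcount         =  1   (-1 + 1 PTE + 1 DoubleMap offset)
 *   compound _mapcount storage =  0   (i.e. one PMD mapping)
 *
 * page_mapcount(subpage):
 *   ret  = 1 + 1;   // 2
 *   ret += 0 + 1;   // 3, add the compound (PMD) mappings
 *   ret--;          // 2, PageDoubleMap correction
 *
 * Result: 2, one PTE mapping in B plus one PMD mapping in A.
 */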
The flag is set when a PMD mapping is split while other compound mappings of the page
remain, and cleared again when the last compound mapping goes away (the two snippets
below are from the PMD-split and anonymous-THP rmap-removal paths):

/*
 * Set PG_double_map before dropping compound_mapcount to avoid
 * false-negative page_mapped().
 */
if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
        for (i = 0; i < HPAGE_PMD_NR; i++)
                atomic_inc(&page[i]._mapcount);
}

if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
        /* Last compound_mapcount is gone. */
        __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
        if (TestClearPageDoubleMap(page)) {
                /* No need in mapcount reference anymore */
                for (i = 0; i < HPAGE_PMD_NR; i++)
                        atomic_dec(&page[i]._mapcount);
        }
}
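A quick hypothetical trace of those two snippets, for a THP that is PMD-mapped by two
processes (stored compound mapcount 1, logical value 2):

/*
 *   Process B splits its PMD:
 *     compound_mapcount(page) == 2 > 1
 *       -> TestSetPageDoubleMap(), atomic_inc() on every subpage _mapcount
 *   rmap removal for B's former PMD mapping:
 *     stored compound mapcount 1 -> 0, atomic_add_negative() returns false
 *       -> the flag and the per-subpage offsets stay until process A
 *          unmaps its PMD as well
 */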
The PG_double_map optimization doesn't work for file pages, since the lifecycle of
file pages differs from that of anon pages: a file page can be mapped again at any
time.
With file-backed THP, three helpers change again: page_add_file_rmap(),
total_mapcount() and __page_mapcount(). Before file THP support:
void page_add_file_rmap(struct page *page)
{
        lock_page_memcg(page);
        if (atomic_inc_and_test(&page->_mapcount)) {
                __inc_zone_page_state(page, NR_FILE_MAPPED);
                mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
        }
        unlock_page_memcg(page);
}

int total_mapcount(struct page *page)
{
        int i, ret;

        VM_BUG_ON_PAGE(PageTail(page), page);

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        ret = compound_mapcount(page);
        if (PageHuge(page))
                return ret;
        for (i = 0; i < HPAGE_PMD_NR; i++)
                ret += atomic_read(&page[i]._mapcount) + 1;
        if (PageDoubleMap(page))
                ret -= HPAGE_PMD_NR;
        return ret;
}

int __page_mapcount(struct page *page)
{
        int ret;

        ret = atomic_read(&page->_mapcount) + 1;
        page = compound_head(page);
        ret += atomic_read(compound_mapcount_ptr(page)) + 1;
        if (PageDoubleMap(page))
                ret--;
        return ret;
}
After file THP support:
void page_add_file_rmap(struct page *page, bool compound)
{
        int i, nr = 1;

        VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
        lock_page_memcg(page);
        if (compound && PageTransHuge(page)) {
                for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
                        if (atomic_inc_and_test(&page[i]._mapcount))
                                nr++;
                }
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
        } else {
                if (!atomic_inc_and_test(&page->_mapcount))
                        goto out;
        }
        __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
        mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
out:
        unlock_page_memcg(page);
}
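For instance, PMD-mapping a previously unmapped file THP with
page_add_file_rmap(page, true) would proceed as follows (hypothetical trace):

/*
 *   every page[i]._mapcount: -1 -> 0, atomic_inc_and_test() is true,
 *     so nr ends up as HPAGE_PMD_NR
 *   compound_mapcount storage: -1 -> 0, also true, so we do not jump
 *     to out and NR_FILE_MAPPED is bumped by HPAGE_PMD_NR
 *
 * For file THP a PMD mapping is thus recorded in every subpage's
 * _mapcount as well as in compound_mapcount.
 */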
int total_mapcount(struct page *page)
{
        int i, compound, ret;

        VM_BUG_ON_PAGE(PageTail(page), page);

        if (likely(!PageCompound(page)))
                return atomic_read(&page->_mapcount) + 1;

        compound = compound_mapcount(page);
        if (PageHuge(page))
                return compound;
        ret = compound;
        for (i = 0; i < HPAGE_PMD_NR; i++)
                ret += atomic_read(&page[i]._mapcount) + 1;
        /* File pages has compound_mapcount included in _mapcount */
        if (!PageAnon(page))
                return ret - compound * HPAGE_PMD_NR;
        if (PageDoubleMap(page))
                ret -= HPAGE_PMD_NR;
        return ret;
}
int __page_mapcount(struct page *page)
{
        int ret;

        ret = atomic_read(&page->_mapcount) + 1;
        /*
         * For file THP page->_mapcount contains total number of mapping
         * of the page: no need to look into compound_mapcount.
         */
        if (!PageAnon(page) && !PageHuge(page))
                return ret;
        page = compound_head(page);
        ret += atomic_read(compound_mapcount_ptr(page)) + 1;
        if (PageDoubleMap(page))
                ret--;
        return ret;
}
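A worked example for the file-THP versions, with hypothetical numbers: a file THP is
PMD-mapped in process A, and one of its subpages is additionally PTE-mapped in
process B:

/*
 * Stored values (per page_add_file_rmap() above, the PMD mapping is
 * recorded in every subpage's _mapcount as well as in compound_mapcount):
 *
 *   PTE-mapped subpage->_mapcount  =  1   (-1 + 1 PMD + 1 PTE)
 *   other subpages' _mapcount      =  0   (-1 + 1 PMD)
 *   compound_mapcount(head)        =  1
 *
 * __page_mapcount(PTE-mapped subpage) = 1 + 1 = 2; for file pages no
 * compound lookup is needed.
 *
 * total_mapcount(head):
 *   ret  = 1;                            // compound
 *   ret += 2 + (HPAGE_PMD_NR - 1) * 1;   // per-subpage sums
 *   ret -= 1 * HPAGE_PMD_NR;             // !PageAnon correction
 *   -> 2: one PMD mapping plus one PTE mapping
 */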
commit b14224fbea62e5bffd680613376fe1268f4103ba
Author: Matthew Wilcox (Oracle) <willy@infradead.org>
Date: Wed Jan 11 14:28:50 2023 +0000
mm: convert total_compound_mapcount() to folio_total_mapcount()
Instead of enforcing that the argument must be a head page by naming,
enforce it with the compiler by making it a folio. Also rename the
counter in struct folio from _compound_mapcount to _entire_mapcount.
int folio_total_mapcount(const struct folio *folio)
{
        int mapcount = folio_entire_mapcount(folio);
        int nr_pages;
        int i;

        /* In the common case, avoid the loop when no pages mapped by PTE */
        if (folio_nr_pages_mapped(folio) == 0)
                return mapcount;
        /*
         * Add all the PTE mappings of those pages mapped by PTE.
         * Limit the loop to folio_nr_pages_mapped()?
         * Perhaps: given all the raciness, that may be a good or a bad idea.
         */
        nr_pages = folio_nr_pages(folio);
        for (i = 0; i < nr_pages; i++)
                mapcount += atomic_read(&folio_page(folio, i)->_mapcount);

        /* But each of those _mapcounts was based on -1 */
        mapcount += nr_pages;
        return mapcount;
}
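A small worked example with hypothetical numbers: a 4-page folio that is
entire-mapped once (one PMD) and has one page also mapped by a PTE; by this point an
entire mapping is tracked only in _entire_mapcount, not in the per-page _mapcounts:

/*
 *   folio_entire_mapcount(folio)  = 1
 *   folio_nr_pages_mapped(folio)  = 1          (one page mapped by PTE)
 *   stored per-page _mapcounts    = 0, -1, -1, -1
 *
 * folio_total_mapcount():
 *   mapcount  = 1;                      // entire mappings
 *   mapcount += 0 + (-1) + (-1) + (-1); // running total: -2
 *   mapcount += 4;                      // undo the per-page -1 bias
 *   -> 2: one entire (PMD) mapping plus one PTE mapping
 */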
commit 5ce1f4844ba0def4b1b5526d8ccea27a98e840e5
Author: David Hildenbrand <david@redhat.com>
Date: Mon Feb 26 15:13:24 2024 +0100
mm: remove total_mapcount()
commit 05c5323b2a344c19c51cd1b91a4ab9ae90853794
Author: David Hildenbrand <david@redhat.com>
Date: Tue Apr 9 21:22:47 2024 +0200
mm: track mapcount of large folios in single value
static inline int folio_mapcount(const struct folio *folio)
{
        if (likely(!folio_test_large(folio)))
                return atomic_read(&folio->_mapcount) + 1;
        return folio_large_mapcount(folio);
}
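folio_large_mapcount() reads the single per-folio counter that, after this change,
every map and unmap of any page in a large folio adjusts. A sketch of what that
amounts to, based on the commit subject and assuming the usual -1 bias on the stored
value (not a verbatim copy of the kernel helper):

static inline int folio_large_mapcount_sketch(const struct folio *folio)
{
        /* _large_mapcount is kept up to date by the rmap add/remove
         * paths, so the total mapcount is an O(1) read. */
        return atomic_read(&folio->_large_mapcount) + 1;
}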