From 7b2a2d4a18fffac3c4872021529b0657896db788 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 14:07:31 +0100 Subject: mm: migrate: Add a tracepoint for migrate_pages The pgmigrate_success and pgmigrate_fail vmstat counters tell the user about migration activity but not the type or the reason. This patch adds a tracepoint to identify the type of page migration and why the page is being migrated. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- mm/mempolicy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d04a8a54c294..66e90ecc2350 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -961,7 +961,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1202,7 +1203,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } -- cgit v1.2.3-70-g09d2 From 479e2802d09f1e18a97262c4c6f8f17ae5884bd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:28 +0200 Subject: mm: mempolicy: Make MPOL_LOCAL a real policy Make MPOL_LOCAL a real and exposed policy such that applications that relied on the previous default behaviour can explicitly request it. 
Requested-by: Christoph Lameter Reviewed-by: Rik van Riel Cc: Lee Schermerhorn Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 23e62e0537e2..3e835c9d847b 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -20,6 +20,7 @@ enum { MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, + MPOL_LOCAL, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 66e90ecc2350..54bd3e5ed776 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -269,6 +269,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -2399,7 +2403,6 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2452,12 +2455,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode <= MPOL_LOCAL; mode++) { + for (mode = 0; mode < MPOL_MAX; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode > MPOL_LOCAL) + if (mode >= MPOL_MAX) goto out; switch (mode) { -- cgit v1.2.3-70-g09d2 From d3a710337b0590f43fd236d5e6518439afc7410a Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:29 +0200 Subject: mm: 
mempolicy: Add MPOL_NOOP This patch augments the MPOL_MF_LAZY feature by adding a "NOOP" policy to mbind(). When the NOOP policy is used with the 'MOVE and 'LAZY flags, mbind() will map the pages PROT_NONE so that they will be migrated on the next touch. This allows an application to prepare for a new phase of operation where different regions of shared storage will be assigned to worker threads, w/o changing policy. Note that we could just use "default" policy in this case. However, this also allows an application to request that pages be migrated, only if necessary, to follow any arbitrary policy that might currently apply to a range of pages, without knowing the policy, or without specifying multiple mbind()s for ranges with different policies. [ Bug in early version of mpol_parse_str() reported by Fengguang Wu. ] Bug-Reported-by: Reported-by: Fengguang Wu Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3e835c9d847b..d23dca8367cc 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -21,6 +21,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_NOOP, /* retain existing policy for range */ MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 54bd3e5ed776..c21e91477c4f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -251,10 +251,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? 
nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT) { + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -1147,7 +1147,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT) + if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -2409,7 +2409,8 @@ static const char * const policy_modes[] = [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local" + [MPOL_LOCAL] = "local", + [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2460,7 +2461,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX) + if (mode >= MPOL_MAX || mode == MPOL_NOOP) goto out; switch (mode) { -- cgit v1.2.3-70-g09d2 From 771fb4d806a92bf6c988fcfbd286ae40a9374332 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:30 +0200 Subject: mm: mempolicy: Check for misplaced page This patch provides a new function to test whether a page resides on a node that is appropriate for the mempolicy for the vma and address where the page is supposed to be mapped. This involves looking up the node where the page belongs. So, the function returns that node so that it may be used to allocate the page without consulting the policy again. A subsequent patch will call this function from the fault path. Because of this, I don't want to go ahead and allocate the page, e.g., via alloc_page_vma() only to have to free it if it has the correct policy. So, I just mimic the alloc_page_vma() node computation logic--sort of. 
Note: we could use this function to implement a MPOL_MF_STRICT behavior when migrating pages to match mbind() mempolicy--e.g., to ensure that pages in an interleaved range are reinterleaved rather than left where they are when they reside on any page in the interleave nodemask. Signed-off-by: Lee Schermerhorn Reviewed-by: Rik van Riel Cc: Andrew Morton Cc: Linus Torvalds [ Added MPOL_F_LAZY to trigger migrate-on-fault; simplified code now that we don't have to bother with special crap for interleaved ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman --- include/linux/mempolicy.h | 8 +++++ include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 76 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index e5ccb9ddd90e..c511e2523560 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -198,6 +198,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } +extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); + #else struct mempolicy {}; @@ -323,5 +325,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } +static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + return -1; /* no node preference */ +} + #endif /* CONFIG_NUMA */ #endif diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index d23dca8367cc..472de8a5d37e 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -61,6 +61,7 @@ enum mpol_rebind_step { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ +#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #endif /* _UAPI_LINUX_MEMPOLICY_H 
*/ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c21e91477c4f..df1466d3d2d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2181,6 +2181,82 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. 
+ */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.2.3-70-g09d2 From b24f53a0bea38b266d219ee651b22dba727c44ae Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Add MPOL_MF_LAZY NOTE: Once again there is a lot of patch stealing and the end result is sufficiently different that I had to drop the signed-offs. Will re-add if the original authors are ok with that. This patch adds another mbind() flag to request "lazy migration". The flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected pages are marked PROT_NONE. The pages will be migrated in the fault path on "first touch", if the policy dictates at that time. "Lazy Migration" will allow testing of migrate-on-fault via mbind(). Also allows applications to specify that only subsequently touched pages be migrated to obey new policy, instead of all pages in range. This can be useful for multi-threaded applications working on a large shared data area that is initialized by an initial thread resulting in all pages on one [or a few, if overflowed] nodes. After PROT_NONE, the pages in regions assigned to the worker threads will be automatically migrated local to the threads on 1st touch. 
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/mm.h | 5 ++ include/uapi/linux/mempolicy.h | 13 ++- mm/mempolicy.c | 185 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 185 insertions(+), 18 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index fa1615211159..471185e29bab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +void change_prot_numa(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#endif + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 472de8a5d37e..6a1baae3775d 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -49,9 +49,16 @@ enum mpol_rebind_step { /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ -#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform + to policy */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ +#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ + +#define MPOL_MF_VALID (MPOL_MF_STRICT | \ + MPOL_MF_MOVE | \ + MPOL_MF_MOVE_ALL | \ + MPOL_MF_LAZY) /* * Internal flags that share the struct mempolicy flags word with diff --git a/mm/mempolicy.c b/mm/mempolicy.c index df1466d3d2d8..51d3ebd8561e 100644 --- a/mm/mempolicy.c +++ 
b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include @@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * Here we search for not shared page mappings (mapcount == 1) and we + * set up the pmd/pte_numa on those mappings so the very next access + * will fire a NUMA hinting page fault. + */ +static int +change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + struct page *page; + unsigned long _address, end; + spinlock_t *ptl; + int ret = 0; + + VM_BUG_ON(address & ~PAGE_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + + if (pmd_trans_huge_lock(pmd, vma) == 1) { + int page_nid; + ret = HPAGE_PMD_NR; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) != 1) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + page_nid = page_to_nid(page); + + if (pmd_numa(*pmd)) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + ret += HPAGE_PMD_NR; + /* defer TLB flush to lower the overhead */ + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (pmd_trans_unstable(pmd)) + goto out; + VM_BUG_ON(!pmd_present(*pmd)); + + end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _address < end; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (!pte_present(pteval)) + continue; + if (pte_numa(pteval)) + continue; + page = vm_normal_page(vma, _address, 
pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (page_mapcount(page) != 1) + continue; + + set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); + + /* defer TLB flush to lower the overhead */ + ret++; + } + pte_unmap_unlock(pte, ptl); + + if (ret && !pmd_numa(*pmd)) { + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); + /* defer TLB flush to lower the overhead */ + } + +out: + return ret; +} + +/* Assumes mmap_sem is held */ +void +change_prot_numa(struct vm_area_struct *vma, + unsigned long address, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int progress = 0; + + while (address < end) { + VM_BUG_ON(address < vma->vm_start || + address + PAGE_SIZE > vma->vm_end); + + progress += change_prot_numa_range(mm, vma, address); + address = (address + PMD_SIZE) & PMD_MASK; + } + + /* + * Flush the TLB for the mm to start the NUMA hinting + * page faults after we finish scanning this vma part + * if there were any PTE updates + */ + if (progress) { + mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); + flush_tlb_range(vma, address, end); + mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); + } +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. 
* If pagelist != NULL then isolate pages from the LRU and @@ -583,22 +723,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -606,6 +756,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -1138,8 +1289,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1162,6 +1312,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1198,13 +1351,15 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | 
MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma) && mode != MPOL_NOOP) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, false, MIGRATE_SYNC, @@ -1213,7 +1368,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); -- cgit v1.2.3-70-g09d2 From 4b10e7d562c90d0a72f324832c26653947a07381 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 25 Oct 2012 14:16:32 +0200 Subject: mm: mempolicy: Implement change_prot_numa() in terms of change_protection() This patch converts change_prot_numa() to use change_protection(). As pte_numa and friends check the PTE bits directly it is necessary for change_protection() to use pmd_mknuma(). Hence the required modifications to change_protection() are a little clumsy but the end result is that most of the numa page table helpers are just one or two instructions. 
Signed-off-by: Mel Gorman --- include/linux/huge_mm.h | 3 +- include/linux/mm.h | 4 +- mm/huge_memory.c | 14 ++++- mm/mempolicy.c | 137 +++++------------------------------------------- mm/mprotect.c | 72 +++++++++++++++++++------ 5 files changed, 85 insertions(+), 145 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index dabb5108d6c0..027ad04ef3a8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -27,7 +27,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/mm.h b/include/linux/mm.h index 471185e29bab..d04c2f0aab36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1080,7 +1080,7 @@ extern unsigned long do_mremap(unsigned long addr, unsigned long flags, unsigned long new_addr); extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable); + int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1552,7 +1552,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) #endif #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE -void change_prot_numa(struct vm_area_struct *vma, +unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5723b551c023..d79f7a55bf6f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1147,7 +1147,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, 
pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1155,7 +1155,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); ret = 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 51d3ebd8561e..75d4600a5e92 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -568,134 +568,23 @@ static inline int check_pgd_range(struct vm_area_struct *vma, #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* - * Here we search for not shared page mappings (mapcount == 1) and we - * set up the pmd/pte_numa on those mappings so the very next access - * will fire a NUMA hinting page fault. + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. 
*/ -static int -change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, *_pte; - struct page *page; - unsigned long _address, end; - spinlock_t *ptl; - int ret = 0; - - VM_BUG_ON(address & ~PAGE_MASK); - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto out; - - if (pmd_trans_huge_lock(pmd, vma) == 1) { - int page_nid; - ret = HPAGE_PMD_NR; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page = pmd_page(*pmd); - - /* only check non-shared pages */ - if (page_mapcount(page) != 1) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - page_nid = page_to_nid(page); - - if (pmd_numa(*pmd)) { - spin_unlock(&mm->page_table_lock); - goto out; - } - - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - ret += HPAGE_PMD_NR; - /* defer TLB flush to lower the overhead */ - spin_unlock(&mm->page_table_lock); - goto out; - } - - if (pmd_trans_unstable(pmd)) - goto out; - VM_BUG_ON(!pmd_present(*pmd)); - - end = min(vma->vm_end, (address + PMD_SIZE) & PMD_MASK); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _address < end; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (!pte_present(pteval)) - continue; - if (pte_numa(pteval)) - continue; - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) - continue; - /* only check non-shared pages */ - if (page_mapcount(page) != 1) - continue; - - set_pte_at(mm, _address, _pte, pte_mknuma(pteval)); - - /* defer TLB flush to lower the overhead */ - ret++; - } - pte_unmap_unlock(pte, ptl); - - if (ret && !pmd_numa(*pmd)) { - spin_lock(&mm->page_table_lock); - set_pmd_at(mm, address, pmd, pmd_mknuma(*pmd)); - 
spin_unlock(&mm->page_table_lock); - /* defer TLB flush to lower the overhead */ - } - -out: - return ret; -} - -/* Assumes mmap_sem is held */ -void -change_prot_numa(struct vm_area_struct *vma, - unsigned long address, unsigned long end) +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; - int progress = 0; - - while (address < end) { - VM_BUG_ON(address < vma->vm_start || - address + PAGE_SIZE > vma->vm_end); + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); - progress += change_prot_numa_range(mm, vma, address); - address = (address + PMD_SIZE) & PMD_MASK; - } + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); - /* - * Flush the TLB for the mm to start the NUMA hinting - * page faults after we finish scanning this vma part - * if there were any PTE updates - */ - if (progress) { - mmu_notifier_invalidate_range_start(vma->vm_mm, address, end); - flush_tlb_range(vma, address, end); - mmu_notifier_invalidate_range_end(vma->vm_mm, address, end); - } + return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, diff --git a/mm/mprotect.c b/mm/mprotect.c index 7c3628a8b486..7ef6ae964e8f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,10 +35,11 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -49,19 +50,39 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, 
pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); - pages++; } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -83,9 +104,25 @@ static unsigned long change_pte_range(struct mm_struct *mm, pmd_t *pmd, return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; @@ -97,7 +134,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma->vm_mm, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) { + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { pages += HPAGE_PMD_NR; continue; } @@ -105,8 +142,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * } if 
(pmd_none_or_clear_bad(pmd)) continue; - pages += change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa); + + if (prot_numa) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); return pages; @@ -114,7 +154,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; @@ -126,7 +166,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); return pages; @@ -134,7 +174,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t * static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -150,7 +190,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -162,7 +202,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = 
vma->vm_mm; unsigned long pages; @@ -171,7 +211,7 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable); + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); mmu_notifier_invalidate_range_end(mm, start, end); return pages; @@ -249,7 +289,7 @@ success: dirty_accountable = 1; } - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); -- cgit v1.2.3-70-g09d2 From a720094ded8cbb303111035be91858011d2eac71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 16 Nov 2012 09:37:58 +0000 Subject: mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now The use of MPOL_NOOP and MPOL_MF_LAZY to allow an application to explicitly request lazy migration is a good idea but the actual API has not been well reviewed and once released we have to support it. For now this patch prevents an application using the services. This will need to be revisited. 
Signed-off-by: Mel Gorman --- include/uapi/linux/mempolicy.h | 4 +--- mm/mempolicy.c | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 6a1baae3775d..16fb4e6efbc4 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -21,7 +21,6 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, - MPOL_NOOP, /* retain existing policy for range */ MPOL_MAX, /* always last member of enum */ }; @@ -57,8 +56,7 @@ enum mpol_rebind_step { #define MPOL_MF_VALID (MPOL_MF_STRICT | \ MPOL_MF_MOVE | \ - MPOL_MF_MOVE_ALL | \ - MPOL_MF_LAZY) + MPOL_MF_MOVE_ALL) /* * Internal flags that share the struct mempolicy flags word with diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 75d4600a5e92..a7a62fe7c280 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -252,7 +252,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) { + if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; @@ -1186,7 +1186,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (start & ~PAGE_MASK) return -EINVAL; - if (mode == MPOL_DEFAULT || mode == MPOL_NOOP) + if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; @@ -1241,7 +1241,7 @@ static long do_mbind(unsigned long start, unsigned long len, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... 
*/ - if (!IS_ERR(vma) && mode != MPOL_NOOP) + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); if (!err) { @@ -2530,7 +2530,6 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", - [MPOL_NOOP] = "noop", /* should not actually be used */ }; @@ -2581,7 +2580,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) break; } } - if (mode >= MPOL_MAX || mode == MPOL_NOOP) + if (mode >= MPOL_MAX) goto out; switch (mode) { -- cgit v1.2.3-70-g09d2 From 03c5a6e16322c997bf8f264851bfa3f532ad515f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 14:52:48 +0000 Subject: mm: numa: Add pte updates, hinting and migration stats It is tricky to quantify the basic cost of automatic NUMA placement in a meaningful manner. This patch adds some vmstats that can be used as part of a basic costing model. u = basic unit = sizeof(void *) Ca = cost of struct page access = sizeof(struct page) / u Cpte = Cost PTE access = Ca Cupdate = Cost PTE update = (2 * Cpte) + (2 * Wlock) where Cpte is incurred twice for a read and a write and Wlock is a constant representing the cost of taking or releasing a lock Cnumahint = Cost of a minor page fault = some high constant e.g. 1000 Cpagerw = Cost to read or write a full page = Ca + PAGE_SIZE/u Ci = Cost of page isolation = Ca + Wi where Wi is a constant that should reflect the approximate cost of the locking operation Cpagecopy = Cpagerw + (Cpagerw * Wnuma) + Ci + (Ci * Wnuma) where Wnuma is the approximate NUMA factor. 1 is local. 1.2 would imply that remote accesses are 20% more expensive Balancing cost = Cpte * numa_pte_updates + Cnumahint * numa_hint_faults + Ci * numa_pages_migrated + Cpagecopy * numa_pages_migrated Note that numa_pages_migrated is used as a measure of how many pages were isolated even though it would miss pages that failed to migrate. 
A vmstat counter could have been added for it but the isolation cost is pretty marginal in comparison to the overall cost so it seemed overkill. The ideal way to measure automatic placement benefit would be to count the number of remote accesses versus local accesses and do something like benefit = (remote_accesses_before - remote_accesses_after) * Wnuma but the information is not readily available. As a workload converges, the expectation would be that the number of remote numa hints would reduce to 0. convergence = numa_hint_faults_local / numa_hint_faults where this is measured for the last N number of numa hints recorded. When the workload is fully converged the value is 1. This can measure if the placement policy is converging and how fast it is doing it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- include/linux/vm_event_item.h | 6 ++++++ include/linux/vmstat.h | 8 ++++++++ mm/huge_memory.c | 5 +++++ mm/memory.c | 12 ++++++++++++ mm/mempolicy.c | 2 ++ mm/migrate.c | 3 ++- mm/vmstat.c | 6 ++++++ 7 files changed, 41 insertions(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a1f750b8e72a..55600049e794 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,6 +38,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, +#ifdef CONFIG_NUMA_BALANCING + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, + NUMA_HINT_FAULTS_LOCAL, + NUMA_PAGE_MIGRATE, +#endif #ifdef CONFIG_MIGRATION PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 92a86b2cce33..a13291f7da88 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu) #endif /* CONFIG_VM_EVENT_COUNTERS */ +#ifdef CONFIG_NUMA_BALANCING +#define 
count_vm_numa_event(x) count_vm_event(x) +#define count_vm_numa_events(x, y) count_vm_events(x, y) +#else +#define count_vm_numa_event(x) do {} while (0) +#define count_vm_numa_events(x, y) do {} while (0) +#endif /* CONFIG_NUMA_BALANCING */ + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee8133794a56..f3a477fffd09 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1026,6 +1026,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; + int current_nid = -1; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,6 +1035,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); spin_unlock(&mm->page_table_lock); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); if (target_nid == -1) diff --git a/mm/memory.c b/mm/memory.c index 8012c1907895..8a7b4ccbe136 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3477,6 +3477,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); + count_vm_numa_event(NUMA_HINT_FAULTS); page = vm_normal_page(vma, addr, pte); if (!page) { pte_unmap_unlock(ptep, ptl); @@ -3485,6 +3486,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(page); current_nid = page_to_nid(page); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, addr); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { @@ -3517,6 +3520,9 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 
unsigned long offset; spinlock_t *ptl; bool numa = false; + int local_nid = numa_node_id(); + unsigned long nr_faults = 0; + unsigned long nr_faults_local = 0; spin_lock(&mm->page_table_lock); pmd = *pmdp; @@ -3565,10 +3571,16 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, curr_nid = page_to_nid(page); task_numa_fault(curr_nid, 1); + nr_faults++; + if (curr_nid == local_nid) + nr_faults_local++; + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); + count_vm_numa_events(NUMA_HINT_FAULTS, nr_faults); + count_vm_numa_events(NUMA_HINT_FAULTS_LOCAL, nr_faults_local); return 0; } #else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a7a62fe7c280..516491fbfaa8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -583,6 +583,8 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); return nr_updated; } diff --git a/mm/migrate.c b/mm/migrate.c index c7d550011a64..23bba5d6edff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1514,7 +1514,8 @@ int migrate_misplaced_page(struct page *page, int node) if (nr_remaining) { putback_lru_pages(&migratepages); isolated = 0; - } + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); } BUG_ON(!list_empty(&migratepages)); out: diff --git a/mm/vmstat.c b/mm/vmstat.c index 3a067fabe190..c0f1f6db5182 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,6 +774,12 @@ const char * const vmstat_text[] = { "pgrotated", +#ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", +#endif #ifdef CONFIG_MIGRATION "pgmigrate_success", "pgmigrate_fail", -- cgit v1.2.3-70-g09d2 From 5606e3877ad8baea42f3a71ebde0a03622bbb551 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 2 Nov 2012 18:19:13 +0000 Subject: mm: numa: Migrate on 
reference policy This is the simplest possible policy that still does something of note. When a pte_numa is faulted, it is moved immediately. Any replacement policy must at least do better than this and in all likelihood this policy regresses normal workloads. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 16fb4e6efbc4..0d11c3dcd3a1 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -67,6 +67,7 @@ enum mpol_rebind_step { #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ +#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 516491fbfaa8..4c1c8d83ac6a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -118,6 +118,26 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (!pol) { + node = numa_node_id(); + if (node != -1) + pol = &preferred_node_policy[node]; + + /* preferred_node_policy is not initialised early in boot */ + if (!pol->mode) + pol = NULL; + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -1598,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = 
task->mempolicy; + struct mempolicy *pol = get_task_policy(task); if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -2021,7 +2041,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int cpuset_mems_cookie; @@ -2295,6 +2315,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long default: BUG(); } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) + polnid = numa_node_id(); + if (curnid != polnid) ret = polnid; out: @@ -2483,6 +2508,15 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + for_each_node(nid) { + preferred_node_policy[nid] = (struct mempolicy) { + .refcnt = ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or -- cgit v1.2.3-70-g09d2 From e42c8ff2999de1239a57d434bfbd8e9f2a56e814 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 12 Nov 2012 09:17:07 +0000 Subject: mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships Note: This two-stage filter was taken directly from the sched/numa patch "sched, numa, mm: Add the scanning page fault machinery" but is only a partial extraction. As the end result is not necessarily recognisable, the signed-offs-by had to be removed. Will be added back if requested. While it is desirable that all threads in a process run on its home node, this is not always possible or necessary. There may be more threads than exist within the node or the node might be over-subscribed with unrelated processes. 
This can cause a situation whereby a page gets migrated off its home node because the threads clearing pte_numa were running off-node. This patch uses page->last_nid to build a two-stage filter before pages get migrated to avoid problems with short or unlikely task<->node relationships. Signed-off-by: Mel Gorman --- mm/mempolicy.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c1c8d83ac6a..fd20e28fd2ad 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2317,9 +2317,37 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long } /* Migrate the page towards the node whose CPU is referencing it */ - if (pol->flags & MPOL_F_MORON) + if (pol->flags & MPOL_F_MORON) { + int last_nid; + polnid = numa_node_id(); + /* + * Multi-stage node selection is used in conjunction + * with a periodic migration fault to build a temporal + * task<->page relation. By using a two-stage filter we + * remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist + * probability, we can equate a task's usage of a + * particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and + * getting the same result twice in a row, given these + * samples are fully independent, is then given by + * P(n)^2, provided our sample period is sufficiently + * short compared to the usage pattern. + * + * This quadric squishes small probabilities, making + * it less likely we act on an unlikely task<->page + * relation. 
+ */ + last_nid = page_xchg_last_nid(page, polnid); + if (last_nid != polnid) + goto out; + } + if (curnid != polnid) ret = polnid; out: -- cgit v1.2.3-70-g09d2 From 1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 11:16:36 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing This patch adds Kconfig options and kernel parameters to allow the enabling and disabling of automatic NUMA balancing. The existence of such a switch was and is very important when debugging problems related to transparent hugepages and we should have the same for automatic NUMA placement. Signed-off-by: Mel Gorman --- Documentation/kernel-parameters.txt | 3 +++ include/linux/sched.h | 4 ++++ init/Kconfig | 8 +++++++ kernel/sched/core.c | 48 +++++++++++++++++++++++++------------ kernel/sched/fair.c | 3 +++ kernel/sched/features.h | 6 +++-- mm/mempolicy.c | 46 +++++++++++++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 17 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9776f068306b..2e8d2625b814 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1996,6 +1996,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. + numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. + Allowed values are enable and disable + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. 
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f4ff2bd03f6..b1e619f9ff1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,10 +1563,14 @@ struct task_struct { #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int node, int pages, bool migrated) { } +static inline void set_numabalancing_state(bool enabled) +{ +} #endif /* diff --git a/init/Kconfig b/init/Kconfig index 9f00f004796a..18e2a5920a34 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -720,6 +720,14 @@ config ARCH_USES_NUMA_PROT_NONE depends on ARCH_WANTS_PROT_NUMA_PROT_NONE depends on NUMA_BALANCING +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, autonumic NUMA balancing will be enabled if running on a NUMA + machine. + config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" default y diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d255bc0e278..7a45015274ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -192,23 +192,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -228,6 +215,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 
63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1549,6 +1557,16 @@ static void __sched_fork(struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +#ifdef CONFIG_NUMA_BALANCING +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b577863933f..7a02a2082e95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -811,6 +811,9 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; + if (!sched_feat_numa(NUMA)) + return; + /* FIXME: Allocate task-specific structure for placement policy here */ /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5fb7aefbec80..d2373a3e3252 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,8 +63,10 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) /* - * Apply the automatic NUMA scheduling policy + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. 
Can be controlled via + * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, true) +SCHED_FEAT(NUMA, false) #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fd20e28fd2ad..046308e9b999 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2521,6 +2521,50 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + if (nr_node_ids > 1 && !numabalancing_override) { + printk(KERN_INFO "Enabling automatic NUMA balancing. " + "Configure with numa_balancing= or sysctl"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + numabalancing_override = true; + + if (!strcmp(str, "enable")) { + set_numabalancing_state(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_numabalancing_state(false); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2571,6 +2615,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ -- cgit v1.2.3-70-g09d2