mirror of
https://github.com/torvalds/linux.git
synced 2024-11-16 00:52:01 +00:00
dc6c9a35b6
Dave noticed that an unprivileged process can allocate a significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. The Linux kernel doesn't account PMD tables to the process, only PTE.

The use-cases below use a few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0.

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	#define PUD_SIZE (1UL << 30)
	#define PMD_SIZE (1UL << 21)

	#define NR_PUD 130000

	int main(void)
	{
		char *addr = NULL;
		unsigned long i;

		prctl(PR_SET_THP_DISABLE);
		for (i = 0; i < NR_PUD ; i++) {
			addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
				    MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
			if (addr == MAP_FAILED) {
				perror("mmap");
				break;
			}
			*addr = 'x';
			munmap(addr, PMD_SIZE);
			mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
			     MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
			if (addr == MAP_FAILED)
				perror("re-mmap"), exit(1);
		}
		printf("PID %d consumed %lu KiB in PMD page tables\n",
		       getpid(), i * 4096 >> 10);
		return pause();
	}

The patch addresses the issue by accounting PMD tables to the process the same way we account PTE.

The main places where PMD tables are accounted are __pmd_alloc() and free_pmd_range(). But there are a few corner cases:

 - HugeTLB can share PMD page tables. The patch handles this by accounting the table to all processes that share it.

 - x86 PAE pre-allocates a few PMD tables on fork.

 - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity check on exit(2).

Accounting only happens on configurations where the PMD page table level is present (PMD is not folded). As with nr_ptes, we use a per-mm counter. The counter value is used to calculate the baseline for the badness score by the oom-killer.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
241 lines
6.5 KiB
C
241 lines
6.5 KiB
C
/*
|
|
* mm/debug.c
|
|
*
|
|
* mm/ specific debug routines.
|
|
*
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/ftrace_event.h>
|
|
#include <linux/memcontrol.h>
|
|
|
|
static const struct trace_print_flags pageflag_names[] = {
|
|
{1UL << PG_locked, "locked" },
|
|
{1UL << PG_error, "error" },
|
|
{1UL << PG_referenced, "referenced" },
|
|
{1UL << PG_uptodate, "uptodate" },
|
|
{1UL << PG_dirty, "dirty" },
|
|
{1UL << PG_lru, "lru" },
|
|
{1UL << PG_active, "active" },
|
|
{1UL << PG_slab, "slab" },
|
|
{1UL << PG_owner_priv_1, "owner_priv_1" },
|
|
{1UL << PG_arch_1, "arch_1" },
|
|
{1UL << PG_reserved, "reserved" },
|
|
{1UL << PG_private, "private" },
|
|
{1UL << PG_private_2, "private_2" },
|
|
{1UL << PG_writeback, "writeback" },
|
|
#ifdef CONFIG_PAGEFLAGS_EXTENDED
|
|
{1UL << PG_head, "head" },
|
|
{1UL << PG_tail, "tail" },
|
|
#else
|
|
{1UL << PG_compound, "compound" },
|
|
#endif
|
|
{1UL << PG_swapcache, "swapcache" },
|
|
{1UL << PG_mappedtodisk, "mappedtodisk" },
|
|
{1UL << PG_reclaim, "reclaim" },
|
|
{1UL << PG_swapbacked, "swapbacked" },
|
|
{1UL << PG_unevictable, "unevictable" },
|
|
#ifdef CONFIG_MMU
|
|
{1UL << PG_mlocked, "mlocked" },
|
|
#endif
|
|
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
|
|
{1UL << PG_uncached, "uncached" },
|
|
#endif
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
{1UL << PG_hwpoison, "hwpoison" },
|
|
#endif
|
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
{1UL << PG_compound_lock, "compound_lock" },
|
|
#endif
|
|
};
|
|
|
|
static void dump_flags(unsigned long flags,
|
|
const struct trace_print_flags *names, int count)
|
|
{
|
|
const char *delim = "";
|
|
unsigned long mask;
|
|
int i;
|
|
|
|
pr_emerg("flags: %#lx(", flags);
|
|
|
|
/* remove zone id */
|
|
flags &= (1UL << NR_PAGEFLAGS) - 1;
|
|
|
|
for (i = 0; i < count && flags; i++) {
|
|
|
|
mask = names[i].mask;
|
|
if ((flags & mask) != mask)
|
|
continue;
|
|
|
|
flags &= ~mask;
|
|
pr_cont("%s%s", delim, names[i].name);
|
|
delim = "|";
|
|
}
|
|
|
|
/* check for left over flags */
|
|
if (flags)
|
|
pr_cont("%s%#lx", delim, flags);
|
|
|
|
pr_cont(")\n");
|
|
}
|
|
|
|
void dump_page_badflags(struct page *page, const char *reason,
|
|
unsigned long badflags)
|
|
{
|
|
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
|
|
page, atomic_read(&page->_count), page_mapcount(page),
|
|
page->mapping, page->index);
|
|
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
|
|
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
|
|
if (reason)
|
|
pr_alert("page dumped because: %s\n", reason);
|
|
if (page->flags & badflags) {
|
|
pr_alert("bad because of flags:\n");
|
|
dump_flags(page->flags & badflags,
|
|
pageflag_names, ARRAY_SIZE(pageflag_names));
|
|
}
|
|
#ifdef CONFIG_MEMCG
|
|
if (page->mem_cgroup)
|
|
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
|
|
#endif
|
|
}
|
|
|
|
/*
 * dump_page - log the state of a page without singling out any flags
 * @page: page to describe
 * @reason: optional explanation to print alongside the dump
 *
 * Thin wrapper around dump_page_badflags() that passes an empty bad-flag
 * mask.
 */
void dump_page(struct page *page, const char *reason)
{
	const unsigned long no_badflags = 0;

	dump_page_badflags(page, reason, no_badflags);
}
EXPORT_SYMBOL(dump_page);
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
|
|
|
static const struct trace_print_flags vmaflags_names[] = {
|
|
{VM_READ, "read" },
|
|
{VM_WRITE, "write" },
|
|
{VM_EXEC, "exec" },
|
|
{VM_SHARED, "shared" },
|
|
{VM_MAYREAD, "mayread" },
|
|
{VM_MAYWRITE, "maywrite" },
|
|
{VM_MAYEXEC, "mayexec" },
|
|
{VM_MAYSHARE, "mayshare" },
|
|
{VM_GROWSDOWN, "growsdown" },
|
|
{VM_PFNMAP, "pfnmap" },
|
|
{VM_DENYWRITE, "denywrite" },
|
|
{VM_LOCKED, "locked" },
|
|
{VM_IO, "io" },
|
|
{VM_SEQ_READ, "seqread" },
|
|
{VM_RAND_READ, "randread" },
|
|
{VM_DONTCOPY, "dontcopy" },
|
|
{VM_DONTEXPAND, "dontexpand" },
|
|
{VM_ACCOUNT, "account" },
|
|
{VM_NORESERVE, "noreserve" },
|
|
{VM_HUGETLB, "hugetlb" },
|
|
#if defined(CONFIG_X86)
|
|
{VM_PAT, "pat" },
|
|
#elif defined(CONFIG_PPC)
|
|
{VM_SAO, "sao" },
|
|
#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
|
|
{VM_GROWSUP, "growsup" },
|
|
#elif !defined(CONFIG_MMU)
|
|
{VM_MAPPED_COPY, "mappedcopy" },
|
|
#else
|
|
{VM_ARCH_1, "arch_1" },
|
|
#endif
|
|
{VM_DONTDUMP, "dontdump" },
|
|
#ifdef CONFIG_MEM_SOFT_DIRTY
|
|
{VM_SOFTDIRTY, "softdirty" },
|
|
#endif
|
|
{VM_MIXEDMAP, "mixedmap" },
|
|
{VM_HUGEPAGE, "hugepage" },
|
|
{VM_NOHUGEPAGE, "nohugepage" },
|
|
{VM_MERGEABLE, "mergeable" },
|
|
};
|
|
|
|
void dump_vma(const struct vm_area_struct *vma)
|
|
{
|
|
pr_emerg("vma %p start %p end %p\n"
|
|
"next %p prev %p mm %p\n"
|
|
"prot %lx anon_vma %p vm_ops %p\n"
|
|
"pgoff %lx file %p private_data %p\n",
|
|
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
|
|
vma->vm_prev, vma->vm_mm,
|
|
(unsigned long)pgprot_val(vma->vm_page_prot),
|
|
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
|
|
vma->vm_file, vma->vm_private_data);
|
|
dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
|
|
}
|
|
EXPORT_SYMBOL(dump_vma);
|
|
|
|
void dump_mm(const struct mm_struct *mm)
|
|
{
|
|
pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
|
|
#ifdef CONFIG_MMU
|
|
"get_unmapped_area %p\n"
|
|
#endif
|
|
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
|
|
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
|
|
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
|
|
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
|
|
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
|
|
"start_brk %lx brk %lx start_stack %lx\n"
|
|
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
|
|
"binfmt %p flags %lx core_state %p\n"
|
|
#ifdef CONFIG_AIO
|
|
"ioctx_table %p\n"
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
"owner %p "
|
|
#endif
|
|
"exe_file %p\n"
|
|
#ifdef CONFIG_MMU_NOTIFIER
|
|
"mmu_notifier_mm %p\n"
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
|
|
#endif
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
"tlb_flush_pending %d\n"
|
|
#endif
|
|
"%s", /* This is here to hold the comma */
|
|
|
|
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
|
|
#ifdef CONFIG_MMU
|
|
mm->get_unmapped_area,
|
|
#endif
|
|
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
|
|
mm->pgd, atomic_read(&mm->mm_users),
|
|
atomic_read(&mm->mm_count),
|
|
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
|
|
mm_nr_pmds((struct mm_struct *)mm),
|
|
mm->map_count,
|
|
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
|
|
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
|
|
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
|
|
mm->start_brk, mm->brk, mm->start_stack,
|
|
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
|
|
mm->binfmt, mm->flags, mm->core_state,
|
|
#ifdef CONFIG_AIO
|
|
mm->ioctx_table,
|
|
#endif
|
|
#ifdef CONFIG_MEMCG
|
|
mm->owner,
|
|
#endif
|
|
mm->exe_file,
|
|
#ifdef CONFIG_MMU_NOTIFIER
|
|
mm->mmu_notifier_mm,
|
|
#endif
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
|
|
#endif
|
|
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
|
|
mm->tlb_flush_pending,
|
|
#endif
|
|
"" /* This is here to not have a comma! */
|
|
);
|
|
|
|
dump_flags(mm->def_flags, vmaflags_names,
|
|
ARRAY_SIZE(vmaflags_names));
|
|
}
|
|
|
|
#endif /* CONFIG_DEBUG_VM */
|