Skip to content

Commit 3974388

Browse files
Christoph LameterLinus Torvalds
Christoph Lameter
authored and
Linus Torvalds
committed
[PATCH] Swap Migration V5: sys_migrate_pages interface
sys_migrate_pages implementation using swap based page migration. This is the original API proposed by Ray Bryant in his posts during the first half of 2005 on [email protected] and [email protected]. The intent of sys_migrate_pages is to migrate the memory of a process. A process may have migrated to another node. Memory was allocated optimally for the prior context. sys_migrate_pages allows shifting the memory to the new node. sys_migrate_pages is also useful if the process's available memory nodes have changed through cpuset operations, to manually move the process's memory. Paul Jackson is working on an automated mechanism that will allow an automatic migration if the cpuset of a process is changed. However, a user may decide to manually control the migration. This implementation is put into the policy layer since it uses concepts and functions that are also needed for mbind and friends. The patch also provides a do_migrate_pages function that may be useful for cpusets to automatically move memory. sys_migrate_pages does not modify policies, in contrast to Ray's implementation. The current code here is based on the swap based page migration capability and thus is not able to preserve the physical layout relative to its containing nodeset (which may be a cpuset). When direct page migration becomes available, the implementation needs to be changed to do an isomorphic move of pages between different nodesets. The current implementation simply evicts all pages in the source nodeset that are not in the target nodeset. Patch supports ia64, i386 and x86_64. Signed-off-by: Christoph Lameter <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent dc9aa5b commit 3974388

File tree

11 files changed

+111
-5
lines changed

11 files changed

+111
-5
lines changed

arch/i386/kernel/syscall_table.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,3 +293,4 @@ ENTRY(sys_call_table)
293293
.long sys_inotify_init
294294
.long sys_inotify_add_watch
295295
.long sys_inotify_rm_watch
296+
.long sys_migrate_pages

arch/ia64/kernel/entry.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1600,5 +1600,6 @@ sys_call_table:
16001600
data8 sys_inotify_init
16011601
data8 sys_inotify_add_watch
16021602
data8 sys_inotify_rm_watch
1603+
data8 sys_migrate_pages // 1280
16031604

16041605
.org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls

arch/x86_64/ia32/ia32entry.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ ia32_sys_call_table:
643643
.quad sys_inotify_init
644644
.quad sys_inotify_add_watch
645645
.quad sys_inotify_rm_watch
646+
.quad sys_migrate_pages
646647
ia32_syscall_end:
647648
.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
648649
.quad ni_syscall

include/asm-i386/unistd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,9 @@
299299
#define __NR_inotify_init 291
300300
#define __NR_inotify_add_watch 292
301301
#define __NR_inotify_rm_watch 293
302+
#define __NR_migrate_pages 294
302303

303-
#define NR_syscalls 294
304+
#define NR_syscalls 295
304305

305306
/*
306307
* user-visible error numbers are in the range -1 - -128: see

include/asm-ia64/unistd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,12 +269,13 @@
269269
#define __NR_inotify_init 1277
270270
#define __NR_inotify_add_watch 1278
271271
#define __NR_inotify_rm_watch 1279
272+
#define __NR_migrate_pages 1280
272273

273274
#ifdef __KERNEL__
274275

275276
#include <linux/config.h>
276277

277-
#define NR_syscalls 256 /* length of syscall table */
278+
#define NR_syscalls 270 /* length of syscall table */
278279

279280
#define __ARCH_WANT_SYS_RT_SIGACTION
280281

include/asm-x86_64/ia32_unistd.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,8 @@
299299
#define __NR_ia32_inotify_init 291
300300
#define __NR_ia32_inotify_add_watch 292
301301
#define __NR_ia32_inotify_rm_watch 293
302+
#define __NR_ia32_migrate_pages 294
302303

303-
#define IA32_NR_syscalls 294 /* must be > than biggest syscall! */
304+
#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */
304305

305306
#endif /* _ASM_X86_64_IA32_UNISTD_H_ */

include/asm-x86_64/unistd.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init)
571571
__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
572572
#define __NR_inotify_rm_watch 255
573573
__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
574+
#define __NR_migrate_pages 256
575+
__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
574576

575-
#define __NR_syscall_max __NR_inotify_rm_watch
577+
#define __NR_syscall_max __NR_migrate_pages
576578
#ifndef __NO_STUBS
577579

578580
/* user-visible error numbers are in the range -1 - -4095 */

include/linux/mempolicy.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,9 @@ static inline void check_highest_zone(int k)
162162
policy_zone = k;
163163
}
164164

165+
int do_migrate_pages(struct mm_struct *mm,
166+
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
167+
165168
#else
166169

167170
struct mempolicy {};

include/linux/syscalls.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
511511
asmlinkage long sys_ioprio_get(int which, int who);
512512
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
513513
unsigned long maxnode);
514+
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
515+
const unsigned long __user *from, const unsigned long __user *to);
514516

515517
#endif

kernel/sys_ni.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall);
8282
cond_syscall(sys_inotify_init);
8383
cond_syscall(sys_inotify_add_watch);
8484
cond_syscall(sys_inotify_rm_watch);
85+
cond_syscall(sys_migrate_pages);
8586

8687
/* arch-specific weak syscall entries */
8788
cond_syscall(sys_pciconfig_read);

mm/mempolicy.c

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
614614
return err;
615615
}
616616

617+
/*
618+
* For now migrate_pages simply swaps out the pages from nodes that are in
619+
* the source set but not in the target set. In the future, we would
620+
* want a function that moves pages between the two nodesets in such
621+
* a way as to preserve the physical layout as much as possible.
622+
*
623+
* Returns the number of page that could not be moved.
624+
*/
625+
int do_migrate_pages(struct mm_struct *mm,
626+
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
627+
{
628+
LIST_HEAD(pagelist);
629+
int count = 0;
630+
nodemask_t nodes;
631+
632+
nodes_andnot(nodes, *from_nodes, *to_nodes);
633+
nodes_complement(nodes, nodes);
634+
635+
down_read(&mm->mmap_sem);
636+
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
637+
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
638+
if (!list_empty(&pagelist)) {
639+
migrate_pages(&pagelist, NULL);
640+
if (!list_empty(&pagelist))
641+
count = putback_lru_pages(&pagelist);
642+
}
643+
up_read(&mm->mmap_sem);
644+
return count;
645+
}
646+
617647
/*
618648
* User space interface with variable sized bitmaps for nodelists.
619649
*/
620650

621651
/* Copy a node mask from user space. */
622-
static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
652+
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
623653
unsigned long maxnode)
624654
{
625655
unsigned long k;
@@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
708738
return do_set_mempolicy(mode, &nodes);
709739
}
710740

741+
/* Macro needed until Paul implements this function in kernel/cpuset.c */
742+
#define cpuset_mems_allowed(task) node_online_map
743+
744+
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
745+
const unsigned long __user *old_nodes,
746+
const unsigned long __user *new_nodes)
747+
{
748+
struct mm_struct *mm;
749+
struct task_struct *task;
750+
nodemask_t old;
751+
nodemask_t new;
752+
nodemask_t task_nodes;
753+
int err;
754+
755+
err = get_nodes(&old, old_nodes, maxnode);
756+
if (err)
757+
return err;
758+
759+
err = get_nodes(&new, new_nodes, maxnode);
760+
if (err)
761+
return err;
762+
763+
/* Find the mm_struct */
764+
read_lock(&tasklist_lock);
765+
task = pid ? find_task_by_pid(pid) : current;
766+
if (!task) {
767+
read_unlock(&tasklist_lock);
768+
return -ESRCH;
769+
}
770+
mm = get_task_mm(task);
771+
read_unlock(&tasklist_lock);
772+
773+
if (!mm)
774+
return -EINVAL;
775+
776+
/*
777+
* Check if this process has the right to modify the specified
778+
* process. The right exists if the process has administrative
779+
* capabilities, superuser priviledges or the same
780+
* userid as the target process.
781+
*/
782+
if ((current->euid != task->suid) && (current->euid != task->uid) &&
783+
(current->uid != task->suid) && (current->uid != task->uid) &&
784+
!capable(CAP_SYS_ADMIN)) {
785+
err = -EPERM;
786+
goto out;
787+
}
788+
789+
task_nodes = cpuset_mems_allowed(task);
790+
/* Is the user allowed to access the target nodes? */
791+
if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
792+
err = -EPERM;
793+
goto out;
794+
}
795+
796+
err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
797+
out:
798+
mmput(mm);
799+
return err;
800+
}
801+
802+
711803
/* Retrieve NUMA policy */
712804
asmlinkage long sys_get_mempolicy(int __user *policy,
713805
unsigned long __user *nmask,

0 commit comments

Comments
 (0)