VMEM is not enforced


#1

Hi,
I am testing vmem limit enforcement, referring to section 5.14.3.3, “Configuring Per-job Resource Limit Enforcement at Vnodes”.
However, the job is not forcibly killed when it exceeds its vmem limit.

I had expected that the following message would be output.
=>> PBS: job killed: vmem xxxxxxxx exceeded limit yyyyyy.

However, the job ended normally.
Why is the job not killed due to vmem limitation?

  1. Running program (allocates 1gb of memory: 4 blocks of 256mb).

    [test@sl02-sms ~]$ cat mallc.c
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <malloc.h>

    #define ALLOC_SIZE 4

    /*
     * Allocate ALLOC_SIZE blocks of 256MiB (1GiB total), touch every page so
     * the memory is actually committed, hold it, then free it.  Intended to
     * drive a PBS job over its vmem limit so enforcement can be observed.
     * Returns 0 on success, -1 if an allocation fails.
     */
    int main(){
        int sleep_time = 120;
        char *mem = NULL;
        /* BUG FIX: must be an array of pointers.  The original
         * `char memadd[ALLOC_SIZE]` truncated each malloc'd pointer to a
         * single byte, and the later free() calls on those bytes were
         * undefined behavior. */
        char *memadd[ALLOC_SIZE];
        int i;
        size_t j;
        size_t size = 512 * 512 * 1024; /* 256MiB per block (512*512*1024 bytes) */

        /* Initial hold before allocating.  sleep() returns the unslept
         * remainder when interrupted by a signal, so loop until it is 0. */
        while (sleep_time) {
            sleep_time = sleep(sleep_time);
        }

        for (i = 0; i < ALLOC_SIZE; i++) {
            mem = (char *)malloc(size);
            if (!mem) {
                /* Report the requested size; mem is NULL here, so there is
                 * no usable size to query (the original printed
                 * malloc_usable_size(NULL), which is always 0). */
                printf("not allocate memory size=%zu\n", size);
                return -1;
            }
            /* %zu is the correct printf conversion for size_t (was %d). */
            printf("memory allocated size=%zu\n", malloc_usable_size(mem));
            /* Touch every byte so the pages are really committed and the
             * allocation shows up in resident/virtual memory usage. */
            for (j = 0; j < size; j++) {
                mem[j] = 0x00;
            }
            memadd[i] = mem;
            sleep(10);
        }

        /* BUG FIX: sleep_time is 0 after the first hold loop, so the
         * original second `while (sleep_time)` never ran and the program
         * exited without holding the memory — reset it so the job stays
         * over the limit long enough for MOM's polling to notice. */
        sleep_time = 120;
        while (sleep_time) {
            sleep_time = sleep(sleep_time);
        }

        for (i = 0; i < ALLOC_SIZE; i++) {
            free(memadd[i]);
        }
        return 0;
    }

  2. Running script (5 processes, ~1gb each, ~5gb total).
    [test@sl02-sms ~]$ cat mallc.sh
    #!/bin/bash
    #PBS -N test
    #PBS -j oe
    # Launch 5 copies of ./mallc (~1gb vmem each, ~5gb total) to push the
    # job over its vmem=4gb limit.
    # FIX: the original used word-processor smart quotes (“…”) which echo
    # prints literally; replaced with plain ASCII quotes.  Also fixed the
    # "runnning" typo in the progress messages.

    echo "resource limit information"
    ulimit -a

    # Four background copies plus one in the foreground.
    for n in 1 2 3 4; do
        echo "$n process running"
        ./mallc &
    done
    echo "5 process running"
    ./mallc

    # FIX: wait for the background copies — without this the job's top
    # process can exit while they are still allocating, and PBS tears them
    # down before the vmem limit is ever exceeded/polled.
    wait

    [test@sl02-sms ~]$

  3. Running job.

[test@sl02-sms ~]$ qsub -l pvmem=2gb,vmem=4gb mallc.sh

1443.sl02-sms
[test@sl02-sms ~]$

!!! The job was not killed by the vmem limit. !!!

[test@sl02-sms ~]$ cat test.o1443
resource limit information
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 63668
max locked memory (kbytes, -l) unlimited
max memory size (kbytes, -m) unlimited
open files (-n) 1024
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 16384
cpu time (seconds, -t) unlimited
max user processes (-u) 63668
virtual memory (kbytes, -v) 2097152
file locks (-x) unlimited
1 process runnning
2 process runnning
3 process runnning
4 process runnning
5 process runnning
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536

[test@sl02-sms ~]$ qstat -f
Job Id: 1443.sl02-sms
Job_Name = test
Job_Owner = test@sl02-sms
resources_used.cpupercent = 0
resources_used.cput = 00:00:00
resources_used.mem = 5124kb
resources_used.ncpus = 1
resources_used.vmem = 249844kb
resources_used.walltime = 00:00:12
job_state = R
queue = workq
server = sl02-sms
Checkpoint = u
ctime = Fri Jan 18 11:31:05 2019
Error_Path = sl02-sms:/home/test/test.e1443
exec_host = sl02-c001/0
exec_vnode = (sl02-c001:ncpus=1)
Hold_Types = n
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Fri Jan 18 11:31:05 2019
Output_Path = sl02-sms:/home/test/test.o1443
Priority = 0
qtime = Fri Jan 18 11:31:05 2019
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.place = pack
Resource_List.pvmem = 2gb
Resource_List.select = 1:ncpus=1:vmem=4gb
Resource_List.vmem = 4gb

[root@sl02-c001 ~]# ps -aef | grep mallc |grep -v grep
test 25586 25585 0 11:25 ? 00:00:00 ./mallc
test 25587 25585 0 11:25 ? 00:00:00 ./mallc
test 25588 25585 0 11:25 ? 00:00:00 ./mallc
test 25589 25585 0 11:25 ? 00:00:00 ./mallc
test 25590 25585 0 11:25 ? 00:00:00 ./mallc
[root@sl02-c001 ~]#

My environment is below.
[test@sl02-sms ~]$ qsub --version
pbs_version = 14.1.2

[root@sl02-sms ~]# grep ^resources /var/spool/pbs/sched_priv/sched_config
resources: “ncpus, mem, arch, host, vnode, netwins, aoe”
[root@sl02-sms ~]#

[test@sl02-sms ~]$ pbsnodes -av
sl02-c001
Mom = sl02-c001.localdomain
ntype = PBS
state = free
pcpus = 12
resources_available.arch = linux
resources_available.host = sl02-c001
resources_available.mem = 16363572kb
resources_available.ncpus = 12
resources_available.vnode = sl02-c001
resources_assigned.accelerator_memory = 0kb
resources_assigned.dyna-license = 0
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.netwins = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared

sl02-c002
Mom = sl02-c002.localdomain
ntype = PBS
state = free
pcpus = 12
resources_available.arch = linux
resources_available.host = sl02-c002
resources_available.mem = 16363572kb
resources_available.ncpus = 12
resources_available.vnode = sl02-c002
resources_assigned.accelerator_memory = 0kb
resources_assigned.dyna-license = 0
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.netwins = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared

[test@sl02-sms ~]$

[root@sl02-c001 ~]# cat /var/spool/pbs/mom_priv/config
$clienthost sl02-sms
$usecp *:/home /home
$restrict_user_maxsysid 999
$enforce mem