Job getting submitted to only 01 node in 04 node cluster

#1

When I submit a job to a cluster of 04 nodes and 01 head node, job gets allotted to only one node. Is there any specific configuration that I could have left out?

[hpcuser@mgt1 ~]$ qsub -I -l select=2:ncpus=12;mpiprocs=12
qsub: waiting for job 7.mgt1 to start
qsub: job 7.mgt1 ready

[hpcuser@mgt1 ~]$ qsub -I -l select=2:ncpus=12:mpiprocs=12

[hpcuser@mgt1 ~]$ qstat -an

mgt1:
Req’d Req’d Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


1.mgt1 hpcuser workq STDIN 25701 2 20 – – R 01:26
node01/010+node01/110
7.mgt1 hpcuser workq STDIN 26470 2 24 – – R 00:00
node01/212+node01/312
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
The qstat -f of job id 1.mgt1 is below

[root@mgt1 test]# qstat -f
Job Id: 1.mgt1
Job_Name = STDIN
Job_Owner = hpcuser@mgt1
resources_used.cpupercent = 98
resources_used.cput = 00:45:26
resources_used.mem = 48141760kb
resources_used.ncpus = 20
resources_used.vmem = 48904932kb
resources_used.walltime = 01:23:10
job_state = R
queue = workq
server = mgt1
Checkpoint = u
ctime = Thu Mar 21 18:34:41 2019
Error_Path = /dev/pts/0
exec_host = node01/010+node01/110
exec_vnode = (node01:ncpus=10)+(node01:ncpus=10)
Hold_Types = n
interactive = True
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Thu Mar 21 19:57:50 2019
Output_Path = /dev/pts/0
Priority = 0
qtime = Thu Mar 21 18:34:41 2019
Rerunable = False
Resource_List.ncpus = 20
Resource_List.nodect = 2
Resource_List.place = free
Resource_List.select = 2:ncpus=10
schedselect = 2:ncpus=10
stime = Thu Mar 21 18:34:41 2019
session_id = 25701
jobdir = /nfsshare/home/hpcuser
substate = 42
Variable_List = PBS_O_HOME=/nfsshare/home/hpcuser,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=hpcuser,
PBS_O_PATH=/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools:/op
t/hpc/ferret/ferret-7.4.4-RHEL7-64/bin:/opt/xcat/bin:/opt/xcat/sbin:/op
t/xcat/share/xcat/tools:/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sb
in:/bin:/sbin:/opt/pbs/bin:/nfsshare/home/hpcuser/.local/bin:/nfsshare/
home/hpcuser/bin:/opt/pbs/bin,PBS_O_MAIL=/var/spool/mail/hpcuser,
PBS_O_SHELL=/bin/bash,PBS_O_WORKDIR=/nfsshare/home/hpcuser,
PBS_O_SYSTEM=Linux,PBS_O_QUEUE=workq,PBS_O_HOST=mgt1
euser = hpcuser
egroup = hpcuser
hashname = 1.mgt1
queue_rank = 1553173481022
queue_type = E
comment = Job run at Thu Mar 21 at 18:34 on (node01:ncpus=10)+(node01:ncpus
=10)
etime = Thu Mar 21 18:34:41 2019
run_count = 1
Submit_arguments = -I -X -l select=2:ncpus=10
project = _pbs_project_default
forward_x11_cookie = MIT-MAGIC-COOKIE-1:39c4016e0ca635f02ef3b35c73e44524:0
forward_x11_port = True
run_version = 1
+++++++++++++++++++++++++++++++++++++++++++++++++++
The pbsnodes -a output shows the node details as below

[root@mgt1 ~]# pbsnodes -a
node01
Mom = node01
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node01
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node01
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019

node02
Mom = node02
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node02
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node02
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019

node03
Mom = node03
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node03
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node03
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019

node04
Mom = node04
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 24
resources_available.arch = linux
resources_available.host = node04
resources_available.mem = 65185004kb
resources_available.ncpus = 24
resources_available.vnode = node04
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019

0 Likes

#2

Try submitting jobs as below (note: use a colon, not a semicolon, between ncpus and mpiprocs):
qsub -I -l select=2:ncpus=12:mpiprocs=12 -l place=scatter

This place statement tells PBS that you want each of the two requested chunks to be taken from a different node.
So it will now take 12 cores from one node and another 12 cores from another node.

If you do not specify place, it defaults to free/pack; since a single node has enough free cores to hold both chunks (2 x 12 = 24, and each node has 48 cores), both chunks will be given/taken from one node.

I hope this works for you

0 Likes