From T3_KR_KNU
| Line 37: | Line 37: | ||
request_gpus = 1 | request_gpus = 1 | ||
=== Ex 1) Matrix === | |||
matrix.py | |||
import numpy as np | |||
from timeit import default_timer as timer | |||
from numba import vectorize | |||
@vectorize(['float32(float32, float32)'], target='cuda') | |||
def pow(a, b): | |||
    return a ** b | |||
vec_size = 100000000 | |||
a = b = np.array(np.random.sample(vec_size), dtype=np.float32) | |||
c = np.zeros(vec_size, dtype=np.float32) | |||
start = timer() | |||
c = pow(a,b) | |||
duration = timer() - start | |||
print(duration) | |||
matrix.sh | |||
#!/bin/bash | |||
python3.6 -m virtualenv myvenv | |||
source myvenv/bin/activate | |||
pip3 install numba | |||
python3.6 matrix.py | |||
matrix.sub | |||
executable = matrix.sh | |||
arguments = $(ClusterId)$(ProcId) | |||
output = matrix.$(ClusterId).$(ProcId).out | |||
error = matrix.$(ClusterId).$(ProcId).err | |||
log = matrix.$(ClusterId).log | |||
should_transfer_files = YES | |||
transfer_input_files = matrix.py | |||
when_to_transfer_output = ON_EXIT | |||
request_GPUs = 1 | |||
request_CPUs = 1 | |||
queue | |||
=== Ex 2) TensorFlow === | |||
tf_ex.py | |||
import tensorflow as tf | |||
mnist = tf.keras.datasets.mnist | |||
(x_train, y_train), (x_test, y_test) = mnist.load_data() | |||
x_train, x_test = x_train / 255.0, x_test / 255.0 | |||
model = tf.keras.models.Sequential([ | |||
tf.keras.layers.Flatten(input_shape=(28, 28)), | |||
tf.keras.layers.Dense(128, activation='relu'), | |||
tf.keras.layers.Dropout(0.2), | |||
tf.keras.layers.Dense(10, activation='softmax') | |||
]) | |||
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy']) | |||
model.fit(x_train, y_train, epochs=5) | |||
model.evaluate(x_test, y_test, verbose=2) | |||
tf_ex.sh | |||
#!/bin/bash | |||
python3.6 -m virtualenv myvenv | |||
source myvenv/bin/activate | |||
pip3 install tensorflow-gpu==2.0.0-rc1 | |||
python3.6 tf_ex.py | |||
tf_ex.sub | |||
executable = tf_ex.sh | |||
arguments = $(ClusterId)$(ProcId) | |||
output = tf_ex.$(ClusterId).$(ProcId).out | |||
error = tf_ex.$(ClusterId).$(ProcId).err | |||
log = tf_ex.$(ClusterId).log | |||
transfer_input_files = tf_ex.py | |||
when_to_transfer_output = ON_EXIT | |||
request_GPUs = 1 | |||
request_CPUs = 1 | |||
queue | |||
=== Ex 3) Singularity & TensorFlow - cvmfs image === | |||
sing.sh | |||
#!/bin/bash | |||
python3.6 tf_ex.py | |||
sing.sub | |||
executable = sing.sh | |||
arguments = $(ClusterId)$(ProcId) | |||
output = sing.$(ClusterId).$(ProcId).out | |||
error = sing.$(ClusterId).$(ProcId).err | |||
log = sing.$(ClusterId).log | |||
should_transfer_files = YES | |||
when_to_transfer_output = ON_EXIT | |||
transfer_input_files = tf_ex.py | |||
request_GPUs = 1 | |||
request_CPUs = 1 | |||
+SingularityImage = "/cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest" | |||
queue | |||
=== Ex 4) Singularity & TensorFlow - local image === | |||
sing-local.sub | |||
executable = sing.sh | |||
arguments = $(ClusterId)$(ProcId) | |||
output = sing.$(ClusterId).$(ProcId).out | |||
error = sing.$(ClusterId).$(ProcId).err | |||
log = sing.$(ClusterId).log | |||
should_transfer_files = YES | |||
when_to_transfer_output = ON_EXIT | |||
transfer_input_files = tf_ex.py | |||
request_GPUs = 1 | |||
request_CPUs = 1 | |||
+SingularityImage = "/u/user/hanbi/tensorflow-gpu.sif" | |||
queue | |||
=== Ex 5) Singularity & TensorFlow - docker image === | |||
sing-docker.sub | |||
executable = sing.sh | |||
arguments = $(ClusterId)$(ProcId) | |||
output = sing.$(ClusterId).$(ProcId).out | |||
error = sing.$(ClusterId).$(ProcId).err | |||
log = sing.$(ClusterId).log | |||
should_transfer_files = YES | |||
when_to_transfer_output = ON_EXIT | |||
transfer_input_files = tf_ex.py | |||
request_GPUs = 1 | |||
request_CPUs = 1 | |||
+SingularityImage = "docker://tensorflow/tensorflow:latest-gpu" | |||
queue | |||
=== 참고자료 === | |||
위의 예제들은 아래 문서들을 참고하여 작성했습니다. 좋은 내용이 많으니 이용에 참고하시기 바랍니다. | |||
* https://batchdocs.web.cern.ch/tutorial/exercise10.html | |||
* https://www.tensorflow.org/tutorials/quickstart/beginner | |||
* https://sylabs.io/guides/3.6/user-guide/singularity_and_docker.html | |||
* https://sylabs.io/guides/3.6/user-guide/gpu.html | |||
Latest revision as of 06:37, 21 September 2023
HTCondor에서 GPU 사용하기
경북대학교 중점연구소 GPU 자원이 CMS Tier3 HTCondor Farm에 통합되었습니다.
GPU 사용대상
- CMS 사용자중 신청자
GPU 사용신청
- 기존 CMS 사용자: 사용계획서(기존사용자용).hwp를 간단하게 작성하여 lcg_knu@knu.ac.kr로 제출하시면 됩니다.
- 신규 CMS 사용자: 사용계획서(신규사용자용).hwp를 작성하여 lcg_knu@knu.ac.kr로 제출하시면 됩니다.
- 별도 신청서를 받는 이유는 구체적인 수요 현황을 파악하여 향후 자원 증설 시 반영하기 위함입니다. 양해를 부탁드립니다.
GPU 사용
- GPU 상태 확인
$ condor_status -compact -constraint 'TotalGpus > 0'
Machine Platform Slots Cpus Gpus TotalGb FreCpu FreeGb CpuLoad ST Jobs/Min MaxSlotGb
dm01.knu.ac.kr x64/CentOS7 0 40 3 188.96 40 188.96 0.00 Ui 0.00 *
dm02.knu.ac.kr x64/CentOS7 0 48 4 377.93 48 377.93 0.00 Ui 0.00 *
Machines Owner Claimed Unclaimed Matched Preempting Drain
x64/CentOS7 2 0 0 2 0 0 0
Total 2 0 0 2 0 0 0
$ condor_status -compact -constraint 'TotalGpus > 0' -af Machine TotalGpus CUDADeviceName CUDACapability
dm01.knu.ac.kr 3 TITAN V 1024.64
dm02.knu.ac.kr 4 TITAN Xp 1024.64
- Job submit시 다음 옵션을 추가하면 GPU를 사용할 수 있습니다.
request_gpus = 1
Ex 1) Matrix
matrix.py
import numpy as np
from timeit import default_timer as timer
from numba import vectorize
@vectorize(['float32(float32, float32)'], target='cuda')
def pow(a, b):
    return a ** b
vec_size = 100000000
a = b = np.array(np.random.sample(vec_size), dtype=np.float32)
c = np.zeros(vec_size, dtype=np.float32)
start = timer()
c = pow(a,b)
duration = timer() - start
print(duration)
matrix.sh
#!/bin/bash
python3.6 -m virtualenv myvenv
source myvenv/bin/activate
pip3 install numba
python3.6 matrix.py
matrix.sub
executable = matrix.sh
arguments = $(ClusterId)$(ProcId)
output = matrix.$(ClusterId).$(ProcId).out
error = matrix.$(ClusterId).$(ProcId).err
log = matrix.$(ClusterId).log
should_transfer_files = YES
transfer_input_files = matrix.py
when_to_transfer_output = ON_EXIT
request_GPUs = 1
request_CPUs = 1
queue
Ex 2) TensorFlow
tf_ex.py
import tensorflow as tf
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test, verbose=2)
tf_ex.sh
#!/bin/bash
python3.6 -m virtualenv myvenv
source myvenv/bin/activate
pip3 install tensorflow-gpu==2.0.0-rc1
python3.6 tf_ex.py
tf_ex.sub
executable = tf_ex.sh
arguments = $(ClusterId)$(ProcId)
output = tf_ex.$(ClusterId).$(ProcId).out
error = tf_ex.$(ClusterId).$(ProcId).err
log = tf_ex.$(ClusterId).log
transfer_input_files = tf_ex.py
when_to_transfer_output = ON_EXIT
request_GPUs = 1
request_CPUs = 1
queue
Ex 3) Singularity & TensorFlow - cvmfs image
sing.sh
#!/bin/bash
python3.6 tf_ex.py
sing.sub
executable = sing.sh
arguments = $(ClusterId)$(ProcId)
output = sing.$(ClusterId).$(ProcId).out
error = sing.$(ClusterId).$(ProcId).err
log = sing.$(ClusterId).log
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = tf_ex.py
request_GPUs = 1
request_CPUs = 1
+SingularityImage = "/cvmfs/singularity.opensciencegrid.org/opensciencegrid/tensorflow-gpu:latest"
queue
Ex 4) Singularity & TensorFlow - local image
sing-local.sub
executable = sing.sh
arguments = $(ClusterId)$(ProcId)
output = sing.$(ClusterId).$(ProcId).out
error = sing.$(ClusterId).$(ProcId).err
log = sing.$(ClusterId).log
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = tf_ex.py
request_GPUs = 1
request_CPUs = 1
+SingularityImage = "/u/user/hanbi/tensorflow-gpu.sif"
queue
Ex 5) Singularity & TensorFlow - docker image
sing-docker.sub
executable = sing.sh
arguments = $(ClusterId)$(ProcId)
output = sing.$(ClusterId).$(ProcId).out
error = sing.$(ClusterId).$(ProcId).err
log = sing.$(ClusterId).log
should_transfer_files = YES
when_to_transfer_output = ON_EXIT
transfer_input_files = tf_ex.py
request_GPUs = 1
request_CPUs = 1
+SingularityImage = "docker://tensorflow/tensorflow:latest-gpu"
queue
참고자료
위의 예제들은 아래 문서들을 참고하여 작성했습니다. 좋은 내용이 많으니 이용에 참고하시기 바랍니다.