apiVersion: v1
kind: ConfigMap
metadata:
  name: rings-config-mindx-dls-test   # Must match the name of the Job below; the rings-config- prefix cannot be modified.
  namespace: vcjob                    # Select a proper namespace based on the site requirements. (The namespaces of the ConfigMap and the Job must be the same. In addition, if the tjm component of MindX-add exists, the vcjob namespace cannot be used.)
  labels:
    ring-controller.atlas: ascend-910 # The value cannot be modified. Service operations are performed based on this label.
data:
  hccl.json: |
    {
        "status":"initializing"
    }
---
apiVersion: batch.volcano.sh/v1alpha1 # The value cannot be changed. The Volcano API must be used.
kind: Job                             # Only the Job type is supported at present.
metadata:
  name: mindx-dls-test                # Must be consistent with the name of the ConfigMap.
  namespace: vcjob                    # Select a proper namespace based on the site requirements. (The namespaces of the ConfigMap and the Job must be the same. In addition, if the tjm component of MindX-add exists, the vcjob namespace cannot be used.)
  labels:
    ring-controller.atlas: ascend-910 # Must be the same as the label in the ConfigMap and cannot be changed.
    fault-scheduling: "force"
spec:
  minAvailable: 1                     # 1 in a single-node scenario, N in an N-node distributed scenario.
  schedulerName: volcano              # Use the Volcano scheduler to schedule jobs.
  policies:
    - event: PodEvicted
      action: RestartJob
  plugins:
    ssh: []
    env: []
    svc: []
  maxRetry: 3
  queue: default
  tasks:
    - name: "default-test"
      replicas: 1                     # 1 in a single-node scenario, N in an N-node scenario. The number of NPUs in the requests field is 8 in an N-node scenario.
      template:
        metadata:
          labels:
            app: pytorch
            ring-controller.atlas: ascend-910 # Must be the same as the label in the ConfigMap and cannot be changed.
        spec:
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: volcano.sh/job-name
                        operator: In
                        values:
                          - mindx-dls-test
                  topologyKey: kubernetes.io/hostname
          hostNetwork: true
          containers:
            - image: torch:b030       # Training framework image, which can be modified.
              imagePullPolicy: IfNotPresent
              name: pytorch
              env:
                - name: mindx-dls-test # Must be the same as the job name.
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: XDL_IP         # IP address of the physical node, used to identify the node where the pod is running.
                  valueFrom:
                    fieldRef:
                      fieldPath: status.hostIP
                - name: framework
                  value: "PyTorch"
              command:                 # Commands for running the training script. Ensure that the involved commands and paths exist in the container.
                - "/bin/bash"
                - "-c"
                - "cd /job/code/ResNet50_for_PyTorch_1.8_code/scripts;chmod +x train_start.sh;bash train_start.sh /job/code/ResNet50_for_PyTorch_1.8_code/ /job/output/ DistributedResnet50/main_apex_d76_npu.py --data=/job/data/resnet50/imagenet --seed=49 --worker=128 --learning-rate=1.6 --warmup=8 --label-smoothing=0.1 --mom=0.9 --weight-decay=1.0e-04 --static-loss-scale=128 --print-freq=1 --dist-url='tcp://127.0.0.1:50000' --dist-backend='hccl' --multiprocessing-distributed --benchmark=0 --device='npu' --epoch=90 --batch-size=1024;"
              #args: [ "while true; do sleep 30000; done;" ] # Comment out the command above and enable this line to keep the container running so that you can run the training script manually inside it for debugging.
                                                             # Enter the container with 'kubectl exec -it -n {namespace} {podname} bash'.
              resources:
                requests:
                  huawei.com/Ascend910: 8 # Number of required NPUs. The maximum value is 8. You can add lines below to configure resources such as memory and CPU.
                limits:
                  huawei.com/Ascend910: 8 # Must be consistent with the value in requests.
              volumeMounts:
                - name: ascend-910-config
                  mountPath: /user/serverid/devindex/config
                - name: code
                  mountPath: /job/code/     # Path of the training script in the container.
                - name: data
                  mountPath: /job/data      # Path of the training dataset in the container.
                - name: output
                  mountPath: /job/output    # Training output path in the container.
                - name: slog
                  mountPath: /var/log/npu
                - name: ascend-driver
                  mountPath: /usr/local/Ascend/driver
                - name: ascend-add-ons
                  mountPath: /usr/local/Ascend/add-ons
                - name: dshm
                  mountPath: /dev/shm
                - name: localtime
                  mountPath: /etc/localtime
          nodeSelector:
            host-arch: huawei-arm     # Configure the label based on the actual job.
          volumes:
            - name: ascend-910-config
              configMap:
                name: rings-config-mindx-dls-test # Corresponds to the ConfigMap name above.
            - name: code
              nfs:
                server: 127.0.0.1     # IP address of the NFS server. In this example, the shared path is /data/atlas_dls/.
                path: "/data/atlas_dls/public/code/"   # Path of the training script on the NFS server.
            - name: data
              nfs:
                server: 127.0.0.1
                path: "/data/atlas_dls/public/dataset" # Path of the training dataset on the NFS server.
            - name: output
              nfs:
                server: 127.0.0.1
                path: "/data/atlas_dls/output/"        # Path for saving the trained model; it depends on the training script.
            - name: slog
              hostPath:
                path: /var/log/npu    # NPU log path on the host, mounted into the container.
            - name: ascend-driver
              hostPath:
                path: /usr/local/Ascend/driver
            - name: ascend-add-ons
              hostPath:
                path: /usr/local/Ascend/add-ons
            - name: localtime
              hostPath:
                path: /etc/localtime  # Use the host time in the container.
            - name: dshm
              emptyDir:
                medium: Memory
                sizeLimit: 16Gi
          restartPolicy: OnFailure
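To run the job, apply the manifest with kubectl and watch the pod in the vcjob namespace. A minimal sketch, assuming the ConfigMap and Job above are saved together in a file named mindx-dls-test.yaml (the filename is arbitrary) and that kubectl is configured for the target cluster:

# Create the ConfigMap and the Volcano Job (both documents are in one file).
kubectl apply -f mindx-dls-test.yaml

# Check that the pod has been scheduled and is running.
kubectl get pods -n vcjob -o wide

# Follow the training logs of the pod created for the job.
kubectl logs -f -n vcjob <pod-name>

# If the commented-out args line is enabled for debugging, open a shell in the container.
kubectl exec -it -n vcjob <pod-name> -- bash

To stop the job and remove both resources, run kubectl delete -f mindx-dls-test.yaml.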