PX2平台Pytorch源码编译

news/2024/7/8 7:36:32 标签: pytorch, 人工智能, python

写在前面:以下内容完成于2019年底,只是把笔记放到了CSDN上。

需要注释掉NCCL及分布式相关的配置

libcudart.patch

diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 4591702..07e1268 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -59,7 +59,7 @@ def _load_cudart():
     if platform.system() == 'Windows':
         lib = find_cuda_windows_lib()
     else:
-        lib = ctypes.cdll.LoadLibrary(None)
+        lib = ctypes.cdll.LoadLibrary("libcudart.so")
     if hasattr(lib, 'cudaGetErrorName'):
         return lib

remove_nccl.patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 159b153..6f7423d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ option(USE_LMDB "Use LMDB" ON)
 option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
-option(USE_NCCL "Use NCCL" ON)
+option(USE_NCCL "Use NCCL" OFF)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
 option(USE_NNAPI "Use NNAPI" OFF)
 option(USE_NNPACK "Use NNPACK" ON)
@@ -119,7 +119,7 @@ option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
 option(USE_MKLDNN "Use MKLDNN" OFF)
-option(USE_DISTRIBUTED "Use distributed" ON)
+option(USE_DISTRIBUTED "Use distributed" OFF)
 cmake_dependent_option(
     USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
     "USE_DISTRIBUTED" OFF)
diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 8859fe1..5d2ed1c 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -6,7 +6,7 @@ from .env import IS_CONDA, IS_LINUX, IS_WINDOWS, CONDA_DIR, check_env_flag, chec
 from .cuda import USE_CUDA
  
 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = False
 USE_GLOO_IBVERBS = False
  
 IB_DEVINFO_CMD = "ibv_devinfo"
diff --git a/tools/setup_helpers/nccl.py b/tools/setup_helpers/nccl.py
index c1cc886..576f74e 100644
--- a/tools/setup_helpers/nccl.py
+++ b/tools/setup_helpers/nccl.py
@@ -9,7 +9,7 @@ from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_
 from .cuda import USE_CUDA, CUDA_HOME
  
  
-USE_NCCL = USE_CUDA and not IS_DARWIN and not IS_WINDOWS
+USE_NCCL = False
 USE_SYSTEM_NCCL = False
 NCCL_LIB_DIR = None
 NCCL_SYSTEM_LIB = None
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 9c4a018..6849cb1 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -690,7 +690,7 @@ if (BUILD_PYTHON)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp)
       list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
       list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-      if (USE_CUDA)
+      if (USE_CUDA AND USE_NCCL)
         list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
       endif()
     endif()
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 2b42e1e..11a866d 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -435,7 +435,7 @@ They are used in specifying strategies for reduction collectives, e.g.,
           &::c10d::ProcessGroup::Work::wait,
           py::call_guard<py::gil_scoped_release>());
  
-#ifdef USE_CUDA
+#if defined(USE_CUDA) && defined(USE_TORCH)
   module.def(
       "_dist_bucket_tensors",
       &::c10d::bucketTensors,

iGPU上显示资源不足:iGPU的寄存器数目只有dGPU的一半,需要降低CUDA的线程数

RuntimeError: cuda runtime error (7) : too many resources requested for launch at /opt/zhangmm/docker/pytorch_1.1.0/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu:67

参考: https://github.com/pytorch/pytorch/issues/8103#issuecomment-424343705

Pytorch on PX2

diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 70a7d05b6..48bf1173e 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
 void initDeviceProperty(DeviceIndex device_index) {
   cudaDeviceProp device_prop;
   AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+  // patch for "too many resources requested for launch"
+  device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
   device_properties[device_index] = device_prop;
 }
  
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e535f4d83..ac057c504 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -12,7 +12,10 @@ namespace at { namespace cuda { namespace detail {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
  
 // Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
  
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 9e3ed7d85..08fcb4532 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -9,7 +9,10 @@
   "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
  
 // Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+const int CUDA_NUM_THREADS = 512;
  
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)

http://www.niftyadmin.cn/n/5536867.html

相关文章

C++: Map数组的遍历

在C++中，map是一个关联容器，它存储的元素是键值对（key-value pairs），其中每个键都是唯一的，并且自动根据键来排序。遍历map的方式有几种，但最常用的两种是使用迭代器（iterator）…

密码学原理精解【5】

这里写目录标题 移位密码概述代码 希尔密码（$Z_{256}$）待加密长度被3整除待加密长度不一定被3整除加解密文件 移位密码 概述 以 $Z_{26}$ 运算为例，$k$ 为密钥。加密：$e_k(x) = (x + k) \bmod 26$；解密：$d_k(x) = (x - k) \bmod 26$…

C#Winform窗体中嵌入exe文件

1，效果以嵌入Modbus Slave为例： 2，代码： public partial class Form1 : Form{//设置嵌入exe的常量private const int nIndex = -16;private const int dwNewLong = 0x10000000;Process m_AppProcess;public Form1(){InitializeCompo…

c进阶篇(四):内存函数

内存函数以字节为单位更改 1.memcpy memcpy 是 C/C++ 中的一个标准库函数，用于内存拷贝操作。它的原型通常定义在 &lt;cstring&gt; 头文件中，其作用是将一块内存中的数据复制到另一块内存中。 函数原型：void *memcpy(void *dest, const void…

ffmpeg使用mjpeg把yuvj420p编码为jpg图像

version #define LIBAVCODEC_VERSION_MAJOR 60 #define LIBAVCODEC_VERSION_MINOR 15 #define LIBAVCODEC_VERSION_MICRO 100 note 1. 通过*.jpg推测时，out_fmt为image2，打开*.jpg文件时，in_fmt为image2 但是out_fmt为image2时…

中英双语介绍美国的州:阿肯色州(Arkansas)

中文版 阿肯色州简介 阿肯色州（Arkansas）位于美国南部，以其多样的自然景观、丰富的文化遗产和不断发展的经济而闻名。以下是对阿肯色州的详细介绍，包括其地理位置、人口、经济、教育、文化和主要城市。 地理位置 阿肯色州东临…

【经验总结】Springboot打印指定类的日志到指定文件中

原文地址：https://www.cnblogs.com/zeng1994/p/f9bff238b13a0bf8fb8bf88c41db7a34.html 以下内容已经过实践，勘误，总结 环境：Springboot2.5.2 公司有个项目，需要和几个第三方系统对接。这种项目，日志一定要…

C++ | Leetcode C++题解之第214题最短回文串

题目： 题解： class Solution { public:string shortestPalindrome(string s) {int n = s.size();vector&lt;int&gt; fail(n, -1);for (int i = 1; i &lt; n; ++i) {int j = fail[i - 1];while (j != -1 &amp;&amp; s[j + 1] != s[i]) {j = fail[j];}if (s[j + 1] …