PX2平台Pytorch源码编译

news/2024/7/8 7:36:32 标签: pytorch, 人工智能, python

写在前面:以下内容完成于2019年底,只是把笔记放到了CSDN上。

需要注释掉NCCL及分布式相关的配置

libcudart.patch

diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 4591702..07e1268 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -59,7 +59,7 @@ def _load_cudart():
     if platform.system() == 'Windows':
         lib = find_cuda_windows_lib()
     else:
-        lib = ctypes.cdll.LoadLibrary(None)
+        lib = ctypes.cdll.LoadLibrary("libcudart.so")
     if hasattr(lib, 'cudaGetErrorName'):
         return lib

remove_nccl.patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 159b153..6f7423d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -95,7 +95,7 @@ option(USE_LMDB "Use LMDB" ON)
 option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_MOBILE_OPENGL "Use OpenGL for mobile code" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
-option(USE_NCCL "Use NCCL" ON)
+option(USE_NCCL "Use NCCL" OFF)
 option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF)
 option(USE_NNAPI "Use NNAPI" OFF)
 option(USE_NNPACK "Use NNPACK" ON)
@@ -119,7 +119,7 @@ option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
 option(USE_MKLDNN "Use MKLDNN" OFF)
-option(USE_DISTRIBUTED "Use distributed" ON)
+option(USE_DISTRIBUTED "Use distributed" OFF)
 cmake_dependent_option(
     USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
     "USE_DISTRIBUTED" OFF)
diff --git a/tools/setup_helpers/dist_check.py b/tools/setup_helpers/dist_check.py
index 8859fe1..5d2ed1c 100644
--- a/tools/setup_helpers/dist_check.py
+++ b/tools/setup_helpers/dist_check.py
@@ -6,7 +6,7 @@ from .env import IS_CONDA, IS_LINUX, IS_WINDOWS, CONDA_DIR, check_env_flag, chec
 from .cuda import USE_CUDA
  
 # On ROCm, RCCL development isn't complete. https://github.com/ROCmSoftwarePlatform/rccl
-USE_DISTRIBUTED = not check_negative_env_flag("USE_DISTRIBUTED") and not IS_WINDOWS and not check_env_flag("USE_ROCM")
+USE_DISTRIBUTED = False
 USE_GLOO_IBVERBS = False
  
 IB_DEVINFO_CMD = "ibv_devinfo"
diff --git a/tools/setup_helpers/nccl.py b/tools/setup_helpers/nccl.py
index c1cc886..576f74e 100644
--- a/tools/setup_helpers/nccl.py
+++ b/tools/setup_helpers/nccl.py
@@ -9,7 +9,7 @@ from .env import IS_WINDOWS, IS_DARWIN, IS_CONDA, CONDA_DIR, check_negative_env_
 from .cuda import USE_CUDA, CUDA_HOME
  
  
-USE_NCCL = USE_CUDA and not IS_DARWIN and not IS_WINDOWS
+USE_NCCL = False
 USE_SYSTEM_NCCL = False
 NCCL_LIB_DIR = None
 NCCL_SYSTEM_LIB = None
diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt
index 9c4a018..6849cb1 100644
--- a/torch/CMakeLists.txt
+++ b/torch/CMakeLists.txt
@@ -690,7 +690,7 @@ if (BUILD_PYTHON)
       list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp)
       list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
       list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
-      if (USE_CUDA)
+      if (USE_CUDA AND USE_NCCL)
         list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/c10d/ddp.cpp)
       endif()
     endif()
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index 2b42e1e..11a866d 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -435,7 +435,7 @@ They are used in specifying strategies for reduction collectives, e.g.,
           &::c10d::ProcessGroup::Work::wait,
           py::call_guard<py::gil_scoped_release>());
  
-#ifdef USE_CUDA
+#if defined(USE_CUDA) && defined(USE_TORCH)
   module.def(
       "_dist_bucket_tensors",
       &::c10d::bucketTensors,

iGPU上显示资源不足:iGPU的寄存器数目只有dGPU的一半,需要降低CUDA的线程数

RuntimeError: cuda runtime error (7) : too many resources requested for launch at /opt/zhangmm/docker/pytorch_1.1.0/aten/src/THCUNN/generic/SpatialUpSamplingBilinear.cu:67

参考: https://github.com/pytorch/pytorch/issues/8103#issuecomment-424343705

Pytorch on PX2

diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 70a7d05b6..48bf1173e 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -24,6 +24,8 @@ void initCUDAContextVectors() {
 void initDeviceProperty(DeviceIndex device_index) {
   cudaDeviceProp device_prop;
   AT_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_index));
+  // patch for "too many resources requested for launch"
+  device_prop.maxThreadsPerBlock = device_prop.maxThreadsPerBlock / 2;
   device_properties[device_index] = device_prop;
 }
  
diff --git a/aten/src/ATen/cuda/detail/KernelUtils.h b/aten/src/ATen/cuda/detail/KernelUtils.h
index e535f4d83..ac057c504 100644
--- a/aten/src/ATen/cuda/detail/KernelUtils.h
+++ b/aten/src/ATen/cuda/detail/KernelUtils.h
@@ -12,7 +12,10 @@ namespace at { namespace cuda { namespace detail {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
  
 // Use 1024 threads per block, which requires cuda sm_2x or above
-constexpr int CUDA_NUM_THREADS = 1024;
+//constexpr int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+constexpr int CUDA_NUM_THREADS = 512;
  
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)
diff --git a/aten/src/THCUNN/common.h b/aten/src/THCUNN/common.h
index 9e3ed7d85..08fcb4532 100644
--- a/aten/src/THCUNN/common.h
+++ b/aten/src/THCUNN/common.h
@@ -9,7 +9,10 @@
   "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
  
 // Use 1024 threads per block, which requires cuda sm_2x or above
-const int CUDA_NUM_THREADS = 1024;
+//const int CUDA_NUM_THREADS = 1024;
+
+// patch for "too many resources requested for launch"
+const int CUDA_NUM_THREADS = 512;
  
 // CUDA: number of blocks for threads.
 inline int GET_BLOCKS(const int N)

http://www.niftyadmin.cn/n/5536867.html

相关文章

C++: Map数组的遍历

在C++中，map是一个关联容器，它存储的元素是键值对（key-value pairs），其中每个键都是唯一的，并且自动根据键来排序。遍历map的方式有几种，但最常用的两种是使用迭代器（iterator）…

密码学原理精解【5】

这里写目录标题 移位密码概述代码 希尔密码（$Z_{256}$）待加密长度被3整除待加密长度不一定被3整除加解密文件 移位密码 概述 以 $Z_{26}$ 运算为例，$k$ 为密钥。加密：$e_k(x) = (x + k) \bmod 26$；解密：$d_k(x) = (x - k) \bmod 26$…

C#Winform窗体中嵌入exe文件

1，效果以嵌入Modbus Slave为例： 2，代码： public partial class Form1 : Form{//设置嵌入exe的常量private const int nIndex = -16;private const int dwNewLong = 0x10000000;Process m_AppProcess;public Form1(){InitializeCompo…

c进阶篇(四):内存函数

内存函数以字节为单位更改 1.memcpy memcpy 是 C/C++ 中的一个标准库函数，用于内存拷贝操作。它的原型通常定义在 &lt;cstring&gt; 头文件中，其作用是将一块内存中的数据复制到另一块内存中。 函数原型：void *memcpy(void *dest, const void…

ffmpeg使用mjpeg把yuvj420p编码为jpg图像

version #define LIBAVCODEC_VERSION_MAJOR 60 #define LIBAVCODEC_VERSION_MINOR 15 #define LIBAVCODEC_VERSION_MICRO 100 note 1. 通过*.jpg推测时，out_fmt为image2，打开*.jpg文件时，in_fmt为image2 但是out_fmt为image2时…

中英双语介绍美国的州:阿肯色州(Arkansas)

中文版 阿肯色州简介 阿肯色州（Arkansas）位于美国南部，以其多样的自然景观、丰富的文化遗产和不断发展的经济而闻名。以下是对阿肯色州的详细介绍，包括其地理位置、人口、经济、教育、文化和主要城市。 地理位置 阿肯色州东临…

【经验总结】Springboot打印指定类的日志到指定文件中

原文地址：https://www.cnblogs.com/zeng1994/p/f9bff238b13a0bf8fb8bf88c41db7a34.html 以下内容已经过实践，勘误，总结 环境：Springboot2.5.2 公司有个项目，需要和几个第三方系统对接。这种项目，日志一定要…

C++ | Leetcode C++题解之第214题最短回文串

题目： 题解： class Solution { public:string shortestPalindrome(string s) {int n = s.size();vector&lt;int&gt; fail(n, -1);for (int i = 1; i &lt; n; ++i) {int j = fail[i - 1];while (j != -1 &amp;&amp; s[j + 1] != s[i]) {j = fail[j];}if (s[j + 1] …