405 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Smarty
		
	
	
	
			
		
		
	
	
			405 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Smarty
		
	
	
	
| """Repository rule for NCCL."""
 | |
| 
 | |
| load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
 | |
| load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
 | |
| 
 | |
| # CUDA toolkit version as tuple (e.g. '(11, 1)').
 | |
| _cuda_version = %{cuda_version}
 | |
| 
 | |
| def _gen_device_srcs_impl(ctx):
 | |
|     ops = ["sum", "prod", "min", "max"]
 | |
|     types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
 | |
|     hdr_tail = "****************************************/"
 | |
|     defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
 | |
| 
 | |
|     files = []
 | |
|     for NCCL_OP, op in enumerate(ops):
 | |
|         for NCCL_TYPE, dt in enumerate(types):
 | |
|             substitutions = {
 | |
|                 hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
 | |
|             }
 | |
|             for src in ctx.files.srcs:
 | |
|                 name = "%s_%s_%s" % (op, dt, src.basename)
 | |
|                 file = ctx.actions.declare_file(name, sibling = src)
 | |
|                 ctx.actions.expand_template(
 | |
|                     output = file,
 | |
|                     template = src,
 | |
|                     substitutions = substitutions,
 | |
|                 )
 | |
|                 files.append(file)
 | |
|     return [DefaultInfo(files = depset(files))]
 | |
| 
 | |
| gen_device_srcs = rule(
 | |
|     implementation = _gen_device_srcs_impl,
 | |
|     attrs = {
 | |
|         "srcs": attr.label_list(allow_files = True),
 | |
|     },
 | |
| )
 | |
| """Adds prefix to each file name in srcs and adds #define NCCL_OP."""
 | |
| 
 | |
| def _rdc_copts():
 | |
|     """Returns copts for compiling relocatable device code."""
 | |
| 
 | |
|     # The global functions can not have a lower register count than the
 | |
|     # device functions. This is enforced by setting a fixed register count.
 | |
|     # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
 | |
|     maxrregcount = "-maxrregcount=96"
 | |
| 
 | |
|     return cuda_default_copts() + select({
 | |
|         "@local_config_cuda//:is_cuda_compiler_nvcc": [
 | |
|             "-nvcc_options",
 | |
|             "relocatable-device-code=true",
 | |
|             "-nvcc_options",
 | |
|             "ptxas-options=" + maxrregcount,
 | |
|         ],
 | |
|         "@local_config_cuda//:is_cuda_compiler_clang": [
 | |
|             "-fcuda-rdc",
 | |
|             "-Xcuda-ptxas",
 | |
|             maxrregcount,
 | |
|         ],
 | |
|         "//conditions:default": [],
 | |
|     })
 | |
| 
 | |
| def _lookup_file(filegroup, path):
 | |
|     """Extracts file at (relative) path in filegroup."""
 | |
|     for file in filegroup.files:
 | |
|         if file.path.endswith(path):
 | |
|             return file
 | |
|     return None
 | |
| 
 | |
| def _pic_only(files):
 | |
|     """Returns the PIC files if there are any in 'files', otherwise 'files'."""
 | |
|     pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
 | |
|     return pic_only if pic_only else files
 | |
| 
 | |
| def _device_link_impl(ctx):
 | |
|     if not ctx.attr.gpu_archs:
 | |
|         fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
 | |
| 
 | |
|     inputs = []
 | |
|     for dep in ctx.attr.deps:
 | |
|         inputs += dep.files.to_list()
 | |
|     inputs = _pic_only(inputs)
 | |
| 
 | |
|     # Device-link to cubins for each architecture.
 | |
|     name = ctx.attr.name
 | |
|     register_h = None
 | |
|     cubins = []
 | |
|     images = []
 | |
|     for arch in ctx.attr.gpu_archs:
 | |
|         arch = arch.replace("compute_", "sm_")  # PTX is JIT-linked at runtime.
 | |
|         cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
 | |
|         register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
 | |
|         ctx.actions.run(
 | |
|             outputs = [register_h, cubin],
 | |
|             inputs = inputs,
 | |
|             executable = ctx.file._nvlink,
 | |
|             arguments = ctx.attr.nvlink_args + [
 | |
|                 "--arch=%s" % arch,
 | |
|                 "--register-link-binaries=%s" % register_h.path,
 | |
|                 "--output-file=%s" % cubin.path,
 | |
|             ] + [file.path for file in inputs],
 | |
|             mnemonic = "nvlink",
 | |
|         )
 | |
|         cubins.append(cubin)
 | |
|         images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
 | |
| 
 | |
|     # Generate fatbin header from all cubins.
 | |
|     tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
 | |
|     fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
 | |
|     bin2c = ctx.file._bin2c
 | |
|     arguments_list = [
 | |
|         "-64",
 | |
|         "--cmdline=--compile-only",
 | |
|         "--link",
 | |
|         "--compress-all",
 | |
|         "--create=%s" % tmp_fatbin.path,
 | |
|         "--embedded-fatbin=%s" % fatbin_h.path,
 | |
|     ]
 | |
|     if _cuda_version <= (10, 1):
 | |
|         arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
 | |
|     ctx.actions.run(
 | |
|         outputs = [tmp_fatbin, fatbin_h],
 | |
|         inputs = cubins,
 | |
|         executable = ctx.file._fatbinary,
 | |
|         arguments = arguments_list + images,
 | |
|         tools = [bin2c],
 | |
|         mnemonic = "fatbinary",
 | |
|     )
 | |
| 
 | |
|     # Generate the source file #including the headers generated above.
 | |
|     ctx.actions.expand_template(
 | |
|         output = ctx.outputs.out,
 | |
|         template = ctx.file._link_stub,
 | |
|         substitutions = {
 | |
|             "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
 | |
|             "FATBINFILE": '"%s"' % fatbin_h.short_path,
 | |
|         },
 | |
|     )
 | |
| 
 | |
|     return [DefaultInfo(files = depset([register_h, fatbin_h]))]
 | |
| 
 | |
| _device_link = rule(
 | |
|     implementation = _device_link_impl,
 | |
|     attrs = {
 | |
|         "deps": attr.label_list(),
 | |
|         "out": attr.output(mandatory = True),
 | |
|         "gpu_archs": attr.string_list(),
 | |
|         "nvlink_args": attr.string_list(),
 | |
|         "_nvlink": attr.label(
 | |
|             default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
 | |
|             allow_single_file = True,
 | |
|             executable = True,
 | |
|             cfg = "host",
 | |
|         ),
 | |
|         "_fatbinary": attr.label(
 | |
|             default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
 | |
|             allow_single_file = True,
 | |
|             executable = True,
 | |
|             cfg = "host",
 | |
|         ),
 | |
|         "_bin2c": attr.label(
 | |
|             default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
 | |
|             allow_single_file = True,
 | |
|             executable = True,
 | |
|             cfg = "host",
 | |
|         ),
 | |
|         "_link_stub": attr.label(
 | |
|             default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
 | |
|             allow_single_file = True,
 | |
|         ),
 | |
|     },
 | |
| )
 | |
| """Links device code and generates source code for kernel registration."""
 | |
| 
 | |
| def _prune_relocatable_code_impl(ctx):
 | |
|     """Clears __nv_relfatbin section containing relocatable device code."""
 | |
| 
 | |
|     if _cuda_version < (11, 3):
 | |
|         # -no-relocatable-elf not supported, return unpruned input.
 | |
|         return ctx.attr.input[DefaultInfo]
 | |
| 
 | |
|     # nvcc --generate-code options for the active set of cuda architectures.
 | |
|     gencodes = []
 | |
|     for code in ctx.attr.gpu_archs:
 | |
|         arch = code.replace("compute_", "sm_")
 | |
|         if code != arch:
 | |
|             gencodes.append((arch, arch))
 | |
|         gencodes.append((arch, code))
 | |
| 
 | |
|     outputs = []
 | |
|     for input in ctx.files.input:
 | |
|         output = ctx.actions.declare_file(
 | |
|             "pruned_" + input.basename,
 | |
|             sibling = input,
 | |
|         )
 | |
|         arguments = (
 | |
|             ["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
 | |
|             ["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
 | |
|         )
 | |
|         ctx.actions.run(
 | |
|             outputs = [output],
 | |
|             inputs = [input],
 | |
|             executable = ctx.file._nvprune,
 | |
|             arguments = arguments,
 | |
|             mnemonic = "nvprune",
 | |
|         )
 | |
|         outputs.append(output)
 | |
| 
 | |
|     return DefaultInfo(files = depset(outputs))
 | |
| 
 | |
| _prune_relocatable_code = rule(
 | |
|     implementation = _prune_relocatable_code_impl,
 | |
|     attrs = {
 | |
|         "input": attr.label(mandatory = True, allow_files = True),
 | |
|         "gpu_archs": attr.string_list(),
 | |
|         "_nvprune": attr.label(
 | |
|             default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
 | |
|             allow_single_file = True,
 | |
|             executable = True,
 | |
|             cfg = "host",
 | |
|         ),
 | |
|     },
 | |
| )
 | |
| 
 | |
| def _merge_archive_impl(ctx):
 | |
|     # Generate an mri script to the merge archives in srcs and pass it to 'ar'.
 | |
|     # See https://stackoverflow.com/a/23621751.
 | |
|     files = _pic_only(ctx.files.srcs)
 | |
|     mri_script = "create " + ctx.outputs.out.path
 | |
|     for f in files:
 | |
|         mri_script += r"\naddlib " + f.path
 | |
|     mri_script += r"\nsave\nend"
 | |
| 
 | |
|     cc_toolchain = find_cpp_toolchain(ctx)
 | |
|     ctx.actions.run_shell(
 | |
|         inputs = ctx.files.srcs,  # + ctx.files._crosstool,
 | |
|         outputs = [ctx.outputs.out],
 | |
|         command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
 | |
|     )
 | |
| 
 | |
| _merge_archive = rule(
 | |
|     implementation = _merge_archive_impl,
 | |
|     attrs = {
 | |
|         "srcs": attr.label_list(mandatory = True, allow_files = True),
 | |
|         "_cc_toolchain": attr.label(
 | |
|             default = "@bazel_tools//tools/cpp:current_cc_toolchain",
 | |
|         ),
 | |
|         # "_crosstool": attr.label_list(
 | |
|         #     cfg = "host",
 | |
|         #     default = ["@bazel_tools//tools/cpp:crosstool"]
 | |
|         # ),
 | |
|     },
 | |
|     outputs = {"out": "lib%{name}.a"},
 | |
| )
 | |
| """Merges srcs into a single archive."""
 | |
| 
 | |
| def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
 | |
|     r"""Produces a cuda_library using separate compilation and linking.
 | |
| 
 | |
|     CUDA separate compilation and linking allows device function calls across
 | |
|     translation units. This is different from the normal whole program
 | |
|     compilation where each translation unit contains all device code. For more
 | |
|     background, see
 | |
|     https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
 | |
|     https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
 | |
| 
 | |
|     During separate compilation, the different CUDA source files are compiled
 | |
|     to 'relocatable device code' (RDC) and embedded in the host object files.
 | |
|     When using nvcc, linking the device code for each supported GPU
 | |
|     architecture and generating kernel registration code for the CUDA runtime
 | |
|     is handled automatically. Clang supports generating relocatable device
 | |
|     code, but it can't link it. We therefore rely on tools provided by the CUDA
 | |
|     SDK to link the device code and generate the host code to register the
 | |
|     kernels.
 | |
| 
 | |
|     The nvlink tool extracts the RDC code from the object files and links it
 | |
|     into cubin files, one per GPU architecture. It also produces a header file
 | |
|     with a list of kernel names to register. The cubins are merged into a
 | |
|     binary blob using the fatbinary tool, and converted to a C header file with
 | |
|     the help of the bin2c tool. The registration header file, the fatbinary
 | |
|     header file, and the link.stub file (shipped with the CUDA SDK) are
 | |
|     compiled as ordinary host code.
 | |
| 
 | |
|     Here is a diagram of the CUDA separate compilation trajectory:
 | |
| 
 | |
|      x.cu.cc    y.cu.cc
 | |
|            \    /            cc_library (compile RDC and archive)
 | |
|             xy.a
 | |
|            /    \            * nvlink
 | |
|     register.h  xy.cubin
 | |
|           :      |           * fatbinary and bin2c
 | |
|           :     xy.fatbin.h
 | |
|           :      :           * #include
 | |
|           dlink.cc           * Expanded from crt/dlink.stub template
 | |
|              |               cc_library (host compile and archive)
 | |
|           dlink.a
 | |
| 
 | |
|     The steps marked with '*' are implemented in the _device_link rule.
 | |
| 
 | |
|     The intermediate relocatable device code in xy.a is no longer needed at
 | |
|     this point and the corresponding section is replaced with an empty one using
 | |
|     objcopy. We do not remove the section completely because it is referenced by
 | |
|     relocations, and removing those as well breaks fatbin registration.
 | |
| 
 | |
|     The object files in both xy.a and dlink.a reference symbols defined in the
 | |
|     other archive. The separate archives are a side effect of using two
 | |
|     cc_library targets to implement a single compilation trajectory. We could
 | |
|     fix this once bazel supports C++ sandwich. For now, we just merge the two
 | |
|     archives to avoid unresolved symbols:
 | |
| 
 | |
|                     xy.a
 | |
|                      |         objcopy --update-section __nv_relfatbin=''
 | |
|     dlink.a     xy_pruned.a
 | |
|          \           /         merge archive
 | |
|           xy_merged.a
 | |
|               |                cc_library (or alternatively, cc_import)
 | |
|          final target
 | |
| 
 | |
|     Another complication is that cc_library produces (depending on the
 | |
|     configuration) both PIC and non-PIC archives, but the distinction
 | |
|     is hidden from Starlark until C++ sandwich becomes available. We work
 | |
|     around this by dropping the non-PIC files if PIC files are available.
 | |
| 
 | |
|     Args:
 | |
|       name: Target name.
 | |
|       hdrs: Header files.
 | |
|       copts: Compiler options.
 | |
|       linkstatic: Must be true.
 | |
|       **kwargs: Any other arguments.
 | |
|     """
 | |
| 
 | |
|     if not hdrs:
 | |
|         hdrs = []
 | |
|     if not copts:
 | |
|         copts = []
 | |
| 
 | |
|     # Compile host and device code into library.
 | |
|     lib = name + "_lib"
 | |
|     native.cc_library(
 | |
|         name = lib,
 | |
|         hdrs = hdrs,
 | |
|         copts = _rdc_copts() + copts,
 | |
|         linkstatic = linkstatic,
 | |
|         **kwargs
 | |
|     )
 | |
| 
 | |
|     # Generate source file containing linked device code.
 | |
|     dlink_hdrs = name + "_dlink_hdrs"
 | |
|     dlink_cc = name + "_dlink.cc"
 | |
|     _device_link(
 | |
|         name = dlink_hdrs,
 | |
|         deps = [lib],
 | |
|         out = dlink_cc,
 | |
|         gpu_archs = cuda_gpu_architectures(),
 | |
|         nvlink_args = select({
 | |
|             "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
 | |
|             "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
 | |
|             "//conditions:default": [],
 | |
|         }),
 | |
|     )
 | |
| 
 | |
|     # Compile the source file into a library.
 | |
|     dlink = name + "_dlink"
 | |
|     native.cc_library(
 | |
|         name = dlink,
 | |
|         srcs = [dlink_cc],
 | |
|         textual_hdrs = [dlink_hdrs],
 | |
|         deps = [
 | |
|             "@local_config_cuda//cuda:cuda_headers",
 | |
|         ],
 | |
|         defines = [
 | |
|             # Silence warning about including internal header.
 | |
|             "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
 | |
|             # Macros that need to be defined starting with CUDA 10.
 | |
|             "__NV_EXTRA_INITIALIZATION=",
 | |
|             "__NV_EXTRA_FINALIZATION=",
 | |
|         ],
 | |
|         linkstatic = linkstatic,
 | |
|     )
 | |
| 
 | |
|     # Remove intermediate relocatable device code.
 | |
|     pruned = name + "_pruned"
 | |
|     _prune_relocatable_code(
 | |
|         name = pruned,
 | |
|         input = lib,
 | |
|         gpu_archs = cuda_gpu_architectures(),
 | |
|     )
 | |
| 
 | |
|     # Repackage the two libs into a single archive. This is required because
 | |
|     # both libs reference symbols defined in the other one. For details, see
 | |
|     # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
 | |
|     merged = name + "_merged"
 | |
|     _merge_archive(
 | |
|         name = merged,
 | |
|         srcs = [pruned, dlink],
 | |
|     )
 | |
| 
 | |
|     # Create cc target from archive.
 | |
|     native.cc_library(
 | |
|         name = name,
 | |
|         srcs = [merged],
 | |
|         hdrs = hdrs,
 | |
|         linkstatic = linkstatic,
 | |
|     )
 |