PyTorch internals. Digging into the dispatcher again.
    REGISTERING A DISPATCHED OPERATOR IN C++
    http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/
    https://zhuanlan.zhihu.com/p/64135058

    [build/aten/src/ATen/Functions.cpp]
    // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
    Tensor empty(IntArrayRef size, const TensorOptions & options, c10::optional<MemoryFormat> memory_format) {
      static auto op = c10::Dispatcher::singleton()
          .findSchemaOrThrow("aten::empty", "memory_format")
          .typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
      return op.call(size, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
    }
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    C10_EXPORT Dispatcher& Dispatcher::singleton() {
      static Dispatcher _singleton;
      return _singleton;
    }
    

    Returns the process-wide Dispatcher singleton.
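    Dispatcher::singleton() is a Meyers singleton: the function-local static is constructed once, on first use, and every caller gets the same process-wide instance. A minimal standalone sketch of the same idiom (Registry is a made-up stand-in, not a PyTorch class):

    #include <cassert>

    // Hypothetical stand-in for c10::Dispatcher, only to illustrate the idiom.
    class Registry {
     public:
      static Registry& singleton() {
        static Registry instance;  // constructed once, on first call; thread-safe since C++11
        return instance;
      }
     private:
      Registry() = default;                // no public construction
      Registry(const Registry&) = delete;  // no copies, so exactly one instance exists
      Registry& operator=(const Registry&) = delete;
    };

    int main() {
      // Every call hands back the same object.
      assert(&Registry::singleton() == &Registry::singleton());
    }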

    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    OperatorHandle Dispatcher::findSchemaOrThrow(const char* name, const char* overload_name) {
      auto it = findSchema({name, overload_name});
      if (!it.has_value()) {
        // Check if we have ANYTHING; if that's the case, that means you're
        // missing schema
        auto it2 = findOp({name, overload_name});
        if (!it2.has_value()) {
          TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name);
        } else {
          TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name,
            " but we found an implementation; did you forget to def() the operator?");
        }
      }
      return it.value();
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    c10::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& overload_name) {
      auto it = findOp(overload_name);
      if (it.has_value()) {
        if (it->hasSchema()) {
          return it;
        } else {
          return c10::nullopt;
        }
      } else {
        return it;
      }
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    c10::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& overload_name) {
      return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> c10::optional<OperatorHandle> {
        auto found = operatorLookupTable.find(overload_name);
        if (found == operatorLookupTable.end()) {
          return c10::nullopt;
        }
        return found->second;
      });
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    Return call(Args... args) const {
      return c10::Dispatcher::singleton().call<Return, Args...>(*this, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    template<class Return, class... Args>
    inline Return Dispatcher::call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      auto dispatchKey = op.operatorIterator_->op.dispatchKeyExtractor()
        .template getDispatchKeyUnboxed<Args...>(
          DispatchKeySet::FULL,
          args...
        );
      return callWithDispatchKey<Return, Args...>(op, dispatchKey, args...);
    }
    

    Here we finally meet the dispatchKey; the dispatch key is the heart of the whole dispatch mechanism.
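    A useful mental model for the unboxed key extraction: every Tensor argument carries a DispatchKeySet, the extractor unions them, and the highest-priority set bit wins (so Autograd outranks a plain backend key). A toy sketch under those assumptions; Key, KeySet, FakeTensor and getDispatchKey are all made up here, the real logic lives in c10::DispatchKeySet / DispatchKeyExtractor and is more involved:

    #include <cstdint>
    #include <initializer_list>

    enum class Key : uint8_t { Undefined = 0, CPU, CUDA, Autograd, NumKeys };

    struct KeySet {
      uint64_t bits = 0;
      void add(Key k) { bits |= (1ull << static_cast<uint8_t>(k)); }
      KeySet merge(KeySet o) const { KeySet r; r.bits = bits | o.bits; return r; }
      // Highest-priority key = highest set bit (Autograd outranks CPU/CUDA here).
      Key highest() const {
        for (int k = static_cast<int>(Key::NumKeys) - 1; k > 0; --k)
          if (bits & (1ull << k)) return static_cast<Key>(k);
        return Key::Undefined;
      }
    };

    struct FakeTensor { KeySet keys; };

    // "Extract" a key from the tensor arguments: union their key sets, take the top.
    Key getDispatchKey(std::initializer_list<FakeTensor> args) {
      KeySet acc;
      for (const auto& t : args) acc = acc.merge(t.keys);
      return acc.highest();
    }

    int main() {
      FakeTensor plain;  plain.keys.add(Key::CPU);
      FakeTensor grad;   grad.keys.add(Key::CPU); grad.keys.add(Key::Autograd);
      // With a requires_grad tensor in the mix, Autograd wins; otherwise CPU.
      return getDispatchKey({plain, grad}) == Key::Autograd ? 0 : 1;
    }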

    [aten/src/ATen/core/dispatch/Dispatcher.h]
    template<class Return, class... Args>
    inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      // No alias dispatch key is allowed at runtime.
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
      const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
    
    #ifndef PYTORCH_DISABLE_PER_OP_PROFILING
      // By default, when there're no high-frequency or non-sampled callbacks,
      // RecordFunction is pre-sampled as a perf optimization;
      // shouldRunRecordFunction checks whether RecordFunction should be executed,
      // and sets pre_sampled boolean argument value to whether pre-sampling was used -
      // this boolean is passed into RecordFunction to adjust the sampling rates of
      // the callbacks
      bool pre_sampled = false;
      if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
        // Check if we need to run callbacks registered with RecordFunction
        // If true and callbacks need inputs, we box the arguments and pass
        // them into the callbacks and also into the kernel call
    
        // Note: for perf reasons we wouldn't want to pass arguments into
        // the function call or prematurely box them
        at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
        if (C10_UNLIKELY(guard.isActive())) {
          if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
            int64_t seq_num = -1;
            // Setting sequence number in the Autograd case to associate
            // the forward range with the coresponding Autograd's node
            if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
              seq_num = at::sequence_number::peek();
            }
            if (guard.needsInputs()) {
              torch::jit::Stack stack = impl::boxArgs(args...);
              guard.before(op, stack, seq_num);
            } else {
              guard.before(op, seq_num);
            }
          }
        }
        // keeping the guard alive while executing the kernel
        return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
      }
    #endif  // PYTORCH_DISABLE_PER_OP_PROFILING
      return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
    }
    

    const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);

    [aten/src/ATen/core/dispatch/OperatorEntry.h]
    const KernelFunction& lookup(DispatchKey k) const {
      const auto& kernel = dispatchTable_[static_cast<uint8_t>(k)];
      if (C10_UNLIKELY(!kernel.isValid())) {
        reportError(k);
      }
      return kernel;
    }
    

    const auto& kernel = dispatchTable_[static_cast<uint8_t>(k)];
    returns the kernel registered for this dispatch key

    return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
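    Two steps just happened: lookup() indexes the per-operator dispatch table with the numeric value of the key, and call() then invokes whatever kernel sits in that slot. A stripped-down sketch of the table-indexing idea; KernelFn, OpEntry and Key are illustrative stand-ins, not the real OperatorEntry types:

    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>

    using KernelFn = void (*)();

    enum class Key : uint8_t { Undefined = 0, CPU, CUDA, NumKeys };

    struct OpEntry {
      std::array<KernelFn, static_cast<std::size_t>(Key::NumKeys)> table{};  // nullptr = no kernel

      const KernelFn& lookup(Key k) const {
        const auto& kernel = table[static_cast<uint8_t>(k)];  // O(1) array index
        if (kernel == nullptr)
          throw std::runtime_error("no kernel registered for this dispatch key");
        return kernel;
      }
    };

    void cpu_kernel() { /* the implementation registered for the CPU slot */ }

    int main() {
      OpEntry op;
      op.table[static_cast<uint8_t>(Key::CPU)] = &cpu_kernel;
      op.lookup(Key::CPU)();  // look the kernel up by key, then call it
    }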

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
        // note: Args above is intentionally not Args&&. We don't want perfect
        // forwarding, which would require Args to be deduced, but instead we
        // want callers to explicitly specify the Args.
    
        if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
            return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
        }
    
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            boxed_kernel_func_ != nullptr,
            "Tried to call KernelFunction::call() on an uninitialized KernelFunction."
        );
    
        return impl::BoxedKernelWrapper<Return(Args...)>::call(
            boxed_kernel_func_,
            functor_.get(),
            opHandle,
            std::forward<Args>(args)...
        );
    }
    

    callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
        using ActualSignature = Return (OperatorKernel*, Args...);
        ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
        return (*func)(functor, std::forward<Args>(args)...);
    }
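    The trick here is type erasure: the unboxed kernel is stored as a plain void* and reinterpret_cast back to its real signature at the call site, with the functor passed as an explicit first argument. A minimal sketch of the same pattern (add_kernel and the surrounding names are made up):

    #include <cstdio>

    struct OperatorKernel {};  // stand-in for c10::OperatorKernel

    // The "real" unboxed kernel: first parameter is the functor/state pointer.
    int add_kernel(OperatorKernel* /*functor*/, int a, int b) { return a + b; }

    int main() {
      // Store the kernel type-erased, as KernelFunction does with unboxed_kernel_func_.
      void* erased = reinterpret_cast<void*>(&add_kernel);

      // At the call site the static types (Return, Args...) are known again,
      // so we can cast back to the actual signature and call through it.
      using ActualSignature = int(OperatorKernel*, int, int);
      auto* func = reinterpret_cast<ActualSignature*>(erased);

      OperatorKernel state;
      std::printf("%d\n", (*func)(&state, 2, 3));  // prints 5
      return 0;
    }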
    
    [aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
    static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
      return (*functor_)(std::forward<ParameterTypes>(args)...);
    }
    
    [build/aten/src/ATen/RegisterBackendSelect.cpp]
    // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
    Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
      static auto op = c10::Dispatcher::singleton()
        .findSchemaOrThrow("aten::empty", "memory_format")
        .typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
      DispatchKey _dk = c10::computeDispatchKey(dtype, layout, device);
      return op.callWithDispatchKey(_dk, size, dtype, layout, device, pin_memory, memory_format);
    }
    
    [c10/core/TensorOptions.h]
    // This is intended to be a centralized location by which we can determine
    // what an appropriate DispatchKey for a tensor is.
    inline DispatchKey computeDispatchKey(c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device) {
      const auto layout_ = layout_or_default(layout);
      const auto device_ = device_or_default(device);
      switch (layout_) {
          case Layout::Strided: {
            const auto dtype_ = dtype_or_default(dtype);
            switch (device_.type()) {
              case DeviceType::CPU: {
                if (isQIntType(dtype_)) {
                  return DispatchKey::QuantizedCPU;
                }
                return DispatchKey::CPU;
              }
              case DeviceType::CUDA: {
                if (isQIntType(dtype_)) {
                  return DispatchKey::QuantizedCUDA;
                }
                return DispatchKey::CUDA;
              }
              case DeviceType::MKLDNN:
                return DispatchKey::MKLDNN;
              case DeviceType::OPENGL:
                return DispatchKey::OpenGL;
              case DeviceType::OPENCL:
                return DispatchKey::OpenCL;
              case DeviceType::IDEEP:
                return DispatchKey::IDEEP;
              case DeviceType::HIP:
                return DispatchKey::HIP;
              case DeviceType::FPGA:
                return DispatchKey::FPGA;
              case DeviceType::MSNPU:
                return DispatchKey::MSNPU;
              case DeviceType::XLA:
                return DispatchKey::XLA;
              case DeviceType::Vulkan:
                return DispatchKey::Vulkan;
              case DeviceType::Metal:
                return DispatchKey::Metal;
              default:
                AT_ERROR("Unsupported device type for dense layout: ", device_.type());
            }
          }
          case Layout::Sparse:
            switch (device_.type()) {
              case DeviceType::CPU:
                return DispatchKey::SparseCPU;
              case DeviceType::CUDA:
                return DispatchKey::SparseCUDA;
              case DeviceType::HIP:
                return DispatchKey::SparseHIP;
              default:
                AT_ERROR("Unsupported device type for sparse layout: ", device_.type());
            }
          case Layout::Mkldnn:
            switch (device_.type()) {
              case DeviceType::CPU:
                return DispatchKey::MkldnnCPU;
              default:
                AT_ERROR("Unsupported device type for mkldnn layout: ", device_.type());
            }
          default:
            AT_ERROR("Unsupported layout: ", layout_);
        }
    }
    

    return DispatchKey::CPU;

    For a strided CPU tensor this returns DispatchKey::CPU, so dispatchKey = DispatchKey::CPU; the backend-specific DispatchKey has now been computed.
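    A hedged usage sketch of computeDispatchKey, assuming this PyTorch revision's headers (the signature taking optional dtype/layout/device, as quoted above): all-default arguments resolve to float/strided/CPU and yield DispatchKey::CPU, while an explicit CUDA device yields DispatchKey::CUDA.

    #include <c10/core/TensorOptions.h>
    #include <cassert>

    int main() {
      // All defaults: strided layout on the CPU with the default scalar type.
      auto k_cpu = c10::computeDispatchKey(c10::nullopt, c10::nullopt, c10::nullopt);
      assert(k_cpu == c10::DispatchKey::CPU);

      // The same call with an explicit CUDA device selects the CUDA table slot instead.
      auto k_cuda = c10::computeDispatchKey(
          c10::nullopt, c10::nullopt, c10::Device(c10::DeviceType::CUDA, 0));
      assert(k_cuda == c10::DispatchKey::CUDA);
      return 0;
    }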

    [aten/src/ATen/core/dispatch/Dispatcher.h]
    Return callWithDispatchKey(DispatchKey dispatchKey, Args... args) const {
      return c10::Dispatcher::singleton().callWithDispatchKey<Return, Args...>(*this, dispatchKey, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      // No alias dispatch key is allowed at runtime.
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
      const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
    
    #ifndef PYTORCH_DISABLE_PER_OP_PROFILING
      // By default, when there're no high-frequency or non-sampled callbacks,
      // RecordFunction is pre-sampled as a perf optimization;
      // shouldRunRecordFunction checks whether RecordFunction should be executed,
      // and sets pre_sampled boolean argument value to whether pre-sampling was used -
      // this boolean is passed into RecordFunction to adjust the sampling rates of
      // the callbacks
      bool pre_sampled = false;
      if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
        // Check if we need to run callbacks registered with RecordFunction
        // If true and callbacks need inputs, we box the arguments and pass
        // them into the callbacks and also into the kernel call
    
        // Note: for perf reasons we wouldn't want to pass arguments into
        // the function call or prematurely box them
        at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
        if (C10_UNLIKELY(guard.isActive())) {
          if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
            int64_t seq_num = -1;
            // Setting sequence number in the Autograd case to associate
            // the forward range with the coresponding Autograd's node
            if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
              seq_num = at::sequence_number::peek();
            }
            if (guard.needsInputs()) {
              torch::jit::Stack stack = impl::boxArgs(args...);
              guard.before(op, stack, seq_num);
            } else {
              guard.before(op, seq_num);
            }
          }
        }
        // keeping the guard alive while executing the kernel
        return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
      }
    #endif  // PYTORCH_DISABLE_PER_OP_PROFILING
      return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
    }
    

    return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
        // note: Args above is intentionally not Args&&. We don't want perfect
        // forwarding, which would require Args to be deduced, but instead we
        // want callers to explicitly specify the Args.
    
        if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
            return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
        }
    
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            boxed_kernel_func_ != nullptr,
            "Tried to call KernelFunction::call() on an uninitialized KernelFunction."
        );
    
        return impl::BoxedKernelWrapper<Return(Args...)>::call(
            boxed_kernel_func_,
            functor_.get(),
            opHandle,
            std::forward<Args>(args)...
        );
    }
    

    return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
        using ActualSignature = Return (OperatorKernel*, Args...);
        ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
        return (*func)(functor, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
    static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
      return (*functor_)(std::forward<ParameterTypes>(args)...);
    }
    
    [aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h]
    decltype(auto) operator()(Parameters... args) {
      return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...);
    }
    
    [build/aten/src/ATen/RegisterCPU.cpp]
    Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
        return at::native::empty_cpu(size, dtype, layout, device, pin_memory, memory_format);
    }
    
    [aten/src/ATen/native/TensorFactories.cpp]
    Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
                     c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
      return at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
    }
    
    [aten/src/ATen/Utils.cpp]
    Tensor empty_cpu(
        IntArrayRef size,
        c10::optional<ScalarType> dtype_opt,
        c10::optional<Layout> layout_opt,
        c10::optional<Device> device_opt,
        c10::optional<bool> pin_memory_opt,
        c10::optional<c10::MemoryFormat> memory_format_opt) {
      Device device = device_or_default(device_opt);
    
      TORCH_CHECK(device.type() == DeviceType::CPU);
      check_size_nonnegative(size);
    
      bool pin_memory = pinned_memory_or_default(pin_memory_opt);
      c10::Allocator* allocator;
      if (pin_memory) {
        allocator = detail::getCUDAHooks().getPinnedMemoryAllocator();
      } else {
        allocator = at::getCPUAllocator();
      }
    
      int64_t nelements = prod_intlist(size);
      caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));
      int64_t size_bytes = nelements * dtype.itemsize();
      auto storage_impl = c10::make_intrusive<StorageImpl>(
          c10::StorageImpl::use_byte_size_t(),
          size_bytes,
          allocator->allocate(size_bytes),
          allocator,
          /*resizeable=*/true);
    
      auto tensor = detail::make_tensor<TensorImpl>(
          std::move(storage_impl), at::DispatchKey::CPU, dtype);
      // Default TensorImpl has size [0]
      if (size.size() != 1 || size[0] != 0) {
        tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
      }
    
      if (memory_format_opt.has_value()) {
        // Restriding a just-created empty contiguous tensor does nothing.
        if (*memory_format_opt != MemoryFormat::Contiguous) {
          tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
        }
      }
    
      return tensor;
    }
    

    allocator = at::getCPUAllocator();  // plain CPU allocator (pinned-memory allocator if pin_memory is set)

    int64_t nelements = prod_intlist(size);  // number of elements

    caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));  // element data type

    int64_t size_bytes = nelements * dtype.itemsize();  // bytes to allocate

    auto storage_impl = c10::make_intrusive<StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
        size_bytes,
        allocator->allocate(size_bytes),
        allocator,
        /*resizeable=*/true);  // allocate the buffer and wrap it in a StorageImpl
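    Concretely, for at::empty({2, 3}) with the default float dtype: nelements = 2 * 3 = 6, itemsize() = 4 bytes, so size_bytes = 24, and that is what the allocator is asked for. A standalone sketch of just that arithmetic (not using the ATen helpers):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int64_t> size = {2, 3};      // as in at::empty({2, 3})
      int64_t nelements = 1;
      for (int64_t d : size) nelements *= d;   // prod_intlist: 2 * 3 = 6

      int64_t itemsize = 4;                    // float32, i.e. dtype.itemsize()
      int64_t size_bytes = nelements * itemsize;

      std::printf("%lld elements, %lld bytes\n",
                  (long long)nelements, (long long)size_bytes);  // 6 elements, 24 bytes
      return 0;
    }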

    [build/aten/src/ATen/core/TensorBody.h]
    template <typename T, typename... Args>
    Tensor make_tensor(Args&&... args) {
      return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
    }
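
    Putting the chain together: one factory call from user code goes Functions.cpp empty() -> dispatcher -> BackendSelect empty_memory_format() -> redispatch on DispatchKey::CPU -> RegisterCPU empty_memory_format() -> empty_cpu(), and the Tensor handed back wraps the TensorImpl/StorageImpl built above. A minimal usage sketch, assuming a libtorch/ATen build is linked:

    #include <ATen/ATen.h>
    #include <iostream>

    int main() {
      // Resolves "aten::empty.memory_format" and ends up in empty_cpu,
      // which allocates 2 * 3 * 4 = 24 bytes from the CPU allocator.
      at::Tensor t = at::empty({2, 3}, at::TensorOptions().dtype(at::kFloat));
      std::cout << t.numel() * t.element_size() << " bytes" << std::endl;  // 24 bytes
      return 0;
    }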