PyTorch internals. Digging into the dispatcher again.
    REGISTERING A DISPATCHED OPERATOR IN C++
    http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/
    https://zhuanlan.zhihu.com/p/64135058

    [build/aten/src/ATen/Functions.cpp]
    // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
    Tensor empty(IntArrayRef size, const TensorOptions & options, c10::optional<MemoryFormat> memory_format) {
      static auto op = c10::Dispatcher::singleton()
          .findSchemaOrThrow("aten::empty", "memory_format")
          .typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
      return op.call(size, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
    }
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    C10_EXPORT Dispatcher& Dispatcher::singleton() {
      static Dispatcher _singleton;
      return _singleton;
    }
    

    Returns the process-wide Dispatcher singleton.
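    Dispatcher::singleton() is a Meyers singleton: the function-local static is constructed once, on first use, and every caller gets the same process-wide instance. A minimal standalone sketch of the same idiom (Registry is a made-up stand-in, not a PyTorch class):

    #include <cassert>

    // Hypothetical stand-in for c10::Dispatcher, only to illustrate the idiom.
    class Registry {
     public:
      static Registry& singleton() {
        static Registry instance;  // constructed once, on first call; thread-safe since C++11
        return instance;
      }
     private:
      Registry() = default;                // no public construction
      Registry(const Registry&) = delete;  // no copies, so exactly one instance exists
      Registry& operator=(const Registry&) = delete;
    };

    int main() {
      // Every call hands back the same object.
      assert(&Registry::singleton() == &Registry::singleton());
    }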

    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    OperatorHandle Dispatcher::findSchemaOrThrow(const char* name, const char* overload_name) {
      auto it = findSchema({name, overload_name});
      if (!it.has_value()) {
        // Check if we have ANYTHING; if that's the case, that means you're
        // missing schema
        auto it2 = findOp({name, overload_name});
        if (!it2.has_value()) {
          TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name);
        } else {
          TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name,
            " but we found an implementation; did you forget to def() the operator?");
        }
      }
      return it.value();
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    c10::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& overload_name) {
      auto it = findOp(overload_name);
      if (it.has_value()) {
        if (it->hasSchema()) {
          return it;
        } else {
          return c10::nullopt;
        }
      } else {
        return it;
      }
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.cpp]
    c10::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& overload_name) {
      return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> c10::optional<OperatorHandle> {
        auto found = operatorLookupTable.find(overload_name);
        if (found == operatorLookupTable.end()) {
          return c10::nullopt;
        }
        return found->second;
      });
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    Return call(Args... args) const {
      return c10::Dispatcher::singleton().call<Return, Args...>(*this, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    template<class Return, class... Args>
    inline Return Dispatcher::call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      auto dispatchKey = op.operatorIterator_->op.dispatchKeyExtractor()
        .template getDispatchKeyUnboxed<Args...>(
          DispatchKeySet::FULL,
          args...
        );
      return callWithDispatchKey<Return, Args...>(op, dispatchKey, args...);
    }
    

    Here we finally meet the dispatchKey; the dispatch key is the heart of the whole dispatch mechanism.
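    A useful mental model for the unboxed key extraction: every Tensor argument carries a DispatchKeySet, the extractor unions them, and the highest-priority set bit wins (so Autograd outranks a plain backend key). A toy sketch under those assumptions; Key, KeySet, FakeTensor and getDispatchKey are all made up here, the real logic lives in c10::DispatchKeySet / DispatchKeyExtractor and is more involved:

    #include <cstdint>
    #include <initializer_list>

    enum class Key : uint8_t { Undefined = 0, CPU, CUDA, Autograd, NumKeys };

    struct KeySet {
      uint64_t bits = 0;
      void add(Key k) { bits |= (1ull << static_cast<uint8_t>(k)); }
      KeySet merge(KeySet o) const { KeySet r; r.bits = bits | o.bits; return r; }
      // Highest-priority key = highest set bit (Autograd outranks CPU/CUDA here).
      Key highest() const {
        for (int k = static_cast<int>(Key::NumKeys) - 1; k > 0; --k)
          if (bits & (1ull << k)) return static_cast<Key>(k);
        return Key::Undefined;
      }
    };

    struct FakeTensor { KeySet keys; };

    // "Extract" a key from the tensor arguments: union their key sets, take the top.
    Key getDispatchKey(std::initializer_list<FakeTensor> args) {
      KeySet acc;
      for (const auto& t : args) acc = acc.merge(t.keys);
      return acc.highest();
    }

    int main() {
      FakeTensor plain;  plain.keys.add(Key::CPU);
      FakeTensor grad;   grad.keys.add(Key::CPU); grad.keys.add(Key::Autograd);
      // With a requires_grad tensor in the mix, Autograd wins; otherwise CPU.
      return getDispatchKey({plain, grad}) == Key::Autograd ? 0 : 1;
    }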

    [aten/src/ATen/core/dispatch/Dispatcher.h]
    template<class Return, class... Args>
    inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      // No alias dispatch key is allowed at runtime.
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
      const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
    
    #ifndef PYTORCH_DISABLE_PER_OP_PROFILING
      // By default, when there're no high-frequency or non-sampled callbacks,
      // RecordFunction is pre-sampled as a perf optimization;
      // shouldRunRecordFunction checks whether RecordFunction should be executed,
      // and sets pre_sampled boolean argument value to whether pre-sampling was used -
      // this boolean is passed into RecordFunction to adjust the sampling rates of
      // the callbacks
      bool pre_sampled = false;
      if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
        // Check if we need to run callbacks registered with RecordFunction
        // If true and callbacks need inputs, we box the arguments and pass
        // them into the callbacks and also into the kernel call
    
        // Note: for perf reasons we wouldn't want to pass arguments into
        // the function call or prematurely box them
        at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
        if (C10_UNLIKELY(guard.isActive())) {
          if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
            int64_t seq_num = -1;
            // Setting sequence number in the Autograd case to associate
            // the forward range with the coresponding Autograd's node
            if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
              seq_num = at::sequence_number::peek();
            }
            if (guard.needsInputs()) {
              torch::jit::Stack stack = impl::boxArgs(args...);
              guard.before(op, stack, seq_num);
            } else {
              guard.before(op, seq_num);
            }
          }
        }
        // keeping the guard alive while executing the kernel
        return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
      }
    #endif  // PYTORCH_DISABLE_PER_OP_PROFILING
      return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
    }
    

    const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);

    [aten/src/ATen/core/dispatch/OperatorEntry.h]
    const KernelFunction& lookup(DispatchKey k) const {
      const auto& kernel = dispatchTable_[static_cast<uint8_t>(k)];
      if (C10_UNLIKELY(!kernel.isValid())) {
        reportError(k);
      }
      return kernel;
    }
    

    const auto& kernel = dispatchTable_[static_cast<uint8_t>(k)];
    returns the kernel registered for this dispatch key

    return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
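    Two steps just happened: lookup() indexes the per-operator dispatch table with the numeric value of the key, and call() then invokes whatever kernel sits in that slot. A stripped-down sketch of the table-indexing idea; KernelFn, OpEntry and Key are illustrative stand-ins, not the real OperatorEntry types:

    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>

    using KernelFn = void (*)();

    enum class Key : uint8_t { Undefined = 0, CPU, CUDA, NumKeys };

    struct OpEntry {
      std::array<KernelFn, static_cast<std::size_t>(Key::NumKeys)> table{};  // nullptr = no kernel

      const KernelFn& lookup(Key k) const {
        const auto& kernel = table[static_cast<uint8_t>(k)];  // O(1) array index
        if (kernel == nullptr)
          throw std::runtime_error("no kernel registered for this dispatch key");
        return kernel;
      }
    };

    void cpu_kernel() { /* the implementation registered for the CPU slot */ }

    int main() {
      OpEntry op;
      op.table[static_cast<uint8_t>(Key::CPU)] = &cpu_kernel;
      op.lookup(Key::CPU)();  // look the kernel up by key, then call it
    }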

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
        // note: Args above is intentionally not Args&&. We don't want perfect
        // forwarding, which would require Args to be deduced, but instead we
        // want callers to explicitly specify the Args.
    
        if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
            return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
        }
    
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            boxed_kernel_func_ != nullptr,
            "Tried to call KernelFunction::call() on an uninitialized KernelFunction."
        );
    
        return impl::BoxedKernelWrapper<Return(Args...)>::call(
            boxed_kernel_func_,
            functor_.get(),
            opHandle,
            std::forward<Args>(args)...
        );
    }
    

    callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
        using ActualSignature = Return (OperatorKernel*, Args...);
        ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
        return (*func)(functor, std::forward<Args>(args)...);
    }
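    The trick here is type erasure: the unboxed kernel is stored as a plain void* and reinterpret_cast back to its real signature at the call site, with the functor passed as an explicit first argument. A minimal sketch of the same pattern (add_kernel and the surrounding names are made up):

    #include <cstdio>

    struct OperatorKernel {};  // stand-in for c10::OperatorKernel

    // The "real" unboxed kernel: first parameter is the functor/state pointer.
    int add_kernel(OperatorKernel* /*functor*/, int a, int b) { return a + b; }

    int main() {
      // Store the kernel type-erased, as KernelFunction does with unboxed_kernel_func_.
      void* erased = reinterpret_cast<void*>(&add_kernel);

      // At the call site the static types (Return, Args...) are known again,
      // so we can cast back to the actual signature and call through it.
      using ActualSignature = int(OperatorKernel*, int, int);
      auto* func = reinterpret_cast<ActualSignature*>(erased);

      OperatorKernel state;
      std::printf("%d\n", (*func)(&state, 2, 3));  // prints 5
      return 0;
    }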
    
    [aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
    static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
      return (*functor_)(std::forward<ParameterTypes>(args)...);
    }
    
    [build/aten/src/ATen/RegisterBackendSelect.cpp]
    // aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
    Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
      static auto op = c10::Dispatcher::singleton()
        .findSchemaOrThrow("aten::empty", "memory_format")
        .typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
      DispatchKey _dk = c10::computeDispatchKey(dtype, layout, device);
      return op.callWithDispatchKey(_dk, size, dtype, layout, device, pin_memory, memory_format);
    }
    
    [c10/core/TensorOptions.h]
    // This is intended to be a centralized location by which we can determine
    // what an appropriate DispatchKey for a tensor is.
    inline DispatchKey computeDispatchKey(c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device) {
      const auto layout_ = layout_or_default(layout);
      const auto device_ = device_or_default(device);
      switch (layout_) {
          case Layout::Strided: {
            const auto dtype_ = dtype_or_default(dtype);
            switch (device_.type()) {
              case DeviceType::CPU: {
                if (isQIntType(dtype_)) {
                  return DispatchKey::QuantizedCPU;
                }
                return DispatchKey::CPU;
              }
              case DeviceType::CUDA: {
                if (isQIntType(dtype_)) {
                  return DispatchKey::QuantizedCUDA;
                }
                return DispatchKey::CUDA;
              }
              case DeviceType::MKLDNN:
                return DispatchKey::MKLDNN;
              case DeviceType::OPENGL:
                return DispatchKey::OpenGL;
              case DeviceType::OPENCL:
                return DispatchKey::OpenCL;
              case DeviceType::IDEEP:
                return DispatchKey::IDEEP;
              case DeviceType::HIP:
                return DispatchKey::HIP;
              case DeviceType::FPGA:
                return DispatchKey::FPGA;
              case DeviceType::MSNPU:
                return DispatchKey::MSNPU;
              case DeviceType::XLA:
                return DispatchKey::XLA;
              case DeviceType::Vulkan:
                return DispatchKey::Vulkan;
              case DeviceType::Metal:
                return DispatchKey::Metal;
              default:
                AT_ERROR("Unsupported device type for dense layout: ", device_.type());
            }
          }
          case Layout::Sparse:
            switch (device_.type()) {
              case DeviceType::CPU:
                return DispatchKey::SparseCPU;
              case DeviceType::CUDA:
                return DispatchKey::SparseCUDA;
              case DeviceType::HIP:
                return DispatchKey::SparseHIP;
              default:
                AT_ERROR("Unsupported device type for sparse layout: ", device_.type());
            }
          case Layout::Mkldnn:
            switch (device_.type()) {
              case DeviceType::CPU:
                return DispatchKey::MkldnnCPU;
              default:
                AT_ERROR("Unsupported device type for mkldnn layout: ", device_.type());
            }
          default:
            AT_ERROR("Unsupported layout: ", layout_);
        }
    }
    

    return DispatchKey::CPU;

    For a strided CPU tensor this returns DispatchKey::CPU, so dispatchKey = DispatchKey::CPU; the backend-specific DispatchKey has now been computed.
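    A hedged usage sketch of computeDispatchKey, assuming this PyTorch revision's headers (the signature taking optional dtype/layout/device, as quoted above): all-default arguments resolve to float/strided/CPU and yield DispatchKey::CPU, while an explicit CUDA device yields DispatchKey::CUDA.

    #include <c10/core/TensorOptions.h>
    #include <cassert>

    int main() {
      // All defaults: strided layout on the CPU with the default scalar type.
      auto k_cpu = c10::computeDispatchKey(c10::nullopt, c10::nullopt, c10::nullopt);
      assert(k_cpu == c10::DispatchKey::CPU);

      // The same call with an explicit CUDA device selects the CUDA table slot instead.
      auto k_cuda = c10::computeDispatchKey(
          c10::nullopt, c10::nullopt, c10::Device(c10::DeviceType::CUDA, 0));
      assert(k_cuda == c10::DispatchKey::CUDA);
      return 0;
    }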

    [aten/src/ATen/core/dispatch/Dispatcher.h]
    Return callWithDispatchKey(DispatchKey dispatchKey, Args... args) const {
      return c10::Dispatcher::singleton().callWithDispatchKey<Return, Args...>(*this, dispatchKey, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/dispatch/Dispatcher.h]
    inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
      detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
      // No alias dispatch key is allowed at runtime.
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
      const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
    
    #ifndef PYTORCH_DISABLE_PER_OP_PROFILING
      // By default, when there're no high-frequency or non-sampled callbacks,
      // RecordFunction is pre-sampled as a perf optimization;
      // shouldRunRecordFunction checks whether RecordFunction should be executed,
      // and sets pre_sampled boolean argument value to whether pre-sampling was used -
      // this boolean is passed into RecordFunction to adjust the sampling rates of
      // the callbacks
      bool pre_sampled = false;
      if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
        // Check if we need to run callbacks registered with RecordFunction
        // If true and callbacks need inputs, we box the arguments and pass
        // them into the callbacks and also into the kernel call
    
        // Note: for perf reasons we wouldn't want to pass arguments into
        // the function call or prematurely box them
        at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
        if (C10_UNLIKELY(guard.isActive())) {
          if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
            int64_t seq_num = -1;
            // Setting sequence number in the Autograd case to associate
            // the forward range with the coresponding Autograd's node
            if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
              seq_num = at::sequence_number::peek();
            }
            if (guard.needsInputs()) {
              torch::jit::Stack stack = impl::boxArgs(args...);
              guard.before(op, stack, seq_num);
            } else {
              guard.before(op, seq_num);
            }
          }
        }
        // keeping the guard alive while executing the kernel
        return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
      }
    #endif  // PYTORCH_DISABLE_PER_OP_PROFILING
      return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
    }
    

    return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
        // note: Args above is intentionally not Args&&. We don't want perfect
        // forwarding, which would require Args to be deduced, but instead we
        // want callers to explicitly specify the Args.
    
        if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
            return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
        }
    
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            boxed_kernel_func_ != nullptr,
            "Tried to call KernelFunction::call() on an uninitialized KernelFunction."
        );
    
        return impl::BoxedKernelWrapper<Return(Args...)>::call(
            boxed_kernel_func_,
            functor_.get(),
            opHandle,
            std::forward<Args>(args)...
        );
    }
    

    return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);

    [aten/src/ATen/core/boxing/KernelFunction_impl.h]
    template<class Return, class... Args>
    inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
        using ActualSignature = Return (OperatorKernel*, Args...);
        ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
        return (*func)(functor, std::forward<Args>(args)...);
    }
    
    [aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
    static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
      return (*functor_)(std::forward<ParameterTypes>(args)...);
    }
    
    [aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h]
    decltype(auto) operator()(Parameters... args) {
      return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...);
    }
    
    [build/aten/src/ATen/RegisterCPU.cpp]
    Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
        return at::native::empty_cpu(size, dtype, layout, device, pin_memory, memory_format);
    }
    
    [aten/src/ATen/native/TensorFactories.cpp]
    Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
                     c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
      return at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
    }
    
    [aten/src/ATen/Utils.cpp]
    Tensor empty_cpu(
        IntArrayRef size,
        c10::optional<ScalarType> dtype_opt,
        c10::optional<Layout> layout_opt,
        c10::optional<Device> device_opt,
        c10::optional<bool> pin_memory_opt,
        c10::optional<c10::MemoryFormat> memory_format_opt) {
      Device device = device_or_default(device_opt);
    
      TORCH_CHECK(device.type() == DeviceType::CPU);
      check_size_nonnegative(size);
    
      bool pin_memory = pinned_memory_or_default(pin_memory_opt);
      c10::Allocator* allocator;
      if (pin_memory) {
        allocator = detail::getCUDAHooks().getPinnedMemoryAllocator();
      } else {
        allocator = at::getCPUAllocator();
      }
    
      int64_t nelements = prod_intlist(size);
      caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));
      int64_t size_bytes = nelements * dtype.itemsize();
      auto storage_impl = c10::make_intrusive<StorageImpl>(
          c10::StorageImpl::use_byte_size_t(),
          size_bytes,
          allocator->allocate(size_bytes),
          allocator,
          /*resizeable=*/true);
    
      auto tensor = detail::make_tensor<TensorImpl>(
          std::move(storage_impl), at::DispatchKey::CPU, dtype);
      // Default TensorImpl has size [0]
      if (size.size() != 1 || size[0] != 0) {
        tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
      }
    
      if (memory_format_opt.has_value()) {
        // Restriding a just-created empty contiguous tensor does nothing.
        if (*memory_format_opt != MemoryFormat::Contiguous) {
          tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
        }
      }
    
      return tensor;
    }
    

    allocator = at::getCPUAllocator();  // plain CPU allocator (pinned-memory allocator if pin_memory is set)

    int64_t nelements = prod_intlist(size);  // number of elements

    caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));  // element data type

    int64_t size_bytes = nelements * dtype.itemsize();  // bytes to allocate

    auto storage_impl = c10::make_intrusive<StorageImpl>(
        c10::StorageImpl::use_byte_size_t(),
        size_bytes,
        allocator->allocate(size_bytes),
        allocator,
        /*resizeable=*/true);  // allocate the buffer and wrap it in a StorageImpl
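    Concretely, for at::empty({2, 3}) with the default float dtype: nelements = 2 * 3 = 6, itemsize() = 4 bytes, so size_bytes = 24, and that is what the allocator is asked for. A standalone sketch of just that arithmetic (not using the ATen helpers):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int64_t> size = {2, 3};      // as in at::empty({2, 3})
      int64_t nelements = 1;
      for (int64_t d : size) nelements *= d;   // prod_intlist: 2 * 3 = 6

      int64_t itemsize = 4;                    // float32, i.e. dtype.itemsize()
      int64_t size_bytes = nelements * itemsize;

      std::printf("%lld elements, %lld bytes\n",
                  (long long)nelements, (long long)size_bytes);  // 6 elements, 24 bytes
      return 0;
    }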

    [build/aten/src/ATen/core/TensorBody.h]
    template <typename T, typename... Args>
    Tensor make_tensor(Args&&... args) {
      return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
    }
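
    Putting the chain together: one factory call from user code goes Functions.cpp empty() -> dispatcher -> BackendSelect empty_memory_format() -> redispatch on DispatchKey::CPU -> RegisterCPU empty_memory_format() -> empty_cpu(), and the Tensor handed back wraps the TensorImpl/StorageImpl built above. A minimal usage sketch, assuming a libtorch/ATen build is linked:

    #include <ATen/ATen.h>
    #include <iostream>

    int main() {
      // Resolves "aten::empty.memory_format" and ends up in empty_cpu,
      // which allocates 2 * 3 * 4 = 24 bytes from the CPU allocator.
      at::Tensor t = at::empty({2, 3}, at::TensorOptions().dtype(at::kFloat));
      std::cout << t.numel() * t.element_size() << " bytes" << std::endl;  // 24 bytes
      return 0;
    }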