PyTorch internals: taking another run at the dispatcher.
CALLING A DISPATCHED OPERATOR IN C++
http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/
https://zhuanlan.zhihu.com/p/64135058
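Before tracing a call, it helps to recall how an operator gets into the dispatcher in the first place. The sketch below is a minimal, hypothetical example (the op name myops::my_empty_like and its kernel are invented for illustration) using the public TORCH_LIBRARY / TORCH_LIBRARY_IMPL macros; aten ops themselves are registered by generated code rather than by hand like this.
#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical CPU kernel for the illustrative op.
at::Tensor my_empty_like_cpu(const at::Tensor& self) {
  return at::empty(self.sizes(), self.options());
}

TORCH_LIBRARY(myops, m) {
  // def() registers the schema; this is what findSchemaOrThrow() later locates.
  m.def("my_empty_like(Tensor self) -> Tensor");
}

TORCH_LIBRARY_IMPL(myops, CPU, m) {
  // impl() fills the CPU slot of this operator's dispatch table.
  m.impl("my_empty_like", my_empty_like_cpu);
}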
[build/aten/src/ATen/Functions.cpp]
// aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
Tensor empty(IntArrayRef size, const TensorOptions & options, c10::optional<MemoryFormat> memory_format) {
static auto op = c10::Dispatcher::singleton()
.findSchemaOrThrow("aten::empty", "memory_format")
.typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
return op.call(size, optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
}
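For reference, a libtorch call like the following is what ends up in the wrapper above (a usage sketch, assuming a standard libtorch build):
#include <ATen/ATen.h>

int main() {
  // Ends up in at::empty(...) above, which asks the dispatcher for the kernel.
  at::Tensor t = at::empty({2, 3}, at::TensorOptions().dtype(at::kFloat).device(at::kCPU));
  return t.numel() == 6 ? 0 : 1;
}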
[aten/src/ATen/core/dispatch/Dispatcher.cpp]
C10_EXPORT Dispatcher& Dispatcher::singleton() {
static Dispatcher _singleton;
return _singleton;
}
This returns the process-wide Dispatcher singleton.
[aten/src/ATen/core/dispatch/Dispatcher.cpp]
OperatorHandle Dispatcher::findSchemaOrThrow(const char* name, const char* overload_name) {
auto it = findSchema({name, overload_name});
if (!it.has_value()) {
// Check if we have ANYTHING; if that's the case, that means you're
// missing schema
auto it2 = findOp({name, overload_name});
if (!it2.has_value()) {
TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name);
} else {
TORCH_CHECK(false, "Could not find schema for ", name, ".", overload_name,
" but we found an implementation; did you forget to def() the operator?");
}
}
return it.value();
}
[aten/src/ATen/core/dispatch/Dispatcher.cpp]
c10::optional<OperatorHandle> Dispatcher::findSchema(const OperatorName& overload_name) {
auto it = findOp(overload_name);
if (it.has_value()) {
if (it->hasSchema()) {
return it;
} else {
return c10::nullopt;
}
} else {
return it;
}
}
[aten/src/ATen/core/dispatch/Dispatcher.cpp]
c10::optional<OperatorHandle> Dispatcher::findOp(const OperatorName& overload_name) {
return operatorLookupTable_.read([&] (const ska::flat_hash_map<OperatorName, OperatorHandle>& operatorLookupTable) -> c10::optional<OperatorHandle> {
auto found = operatorLookupTable.find(overload_name);
if (found == operatorLookupTable.end()) {
return c10::nullopt;
}
return found->second;
});
}
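operatorLookupTable_.read(...) runs the lambda while the table is protected against concurrent writers (c10 uses a LeftRight structure for this). A minimal sketch of the same read-under-lock pattern, with a plain std::mutex standing in for LeftRight and illustrative names throughout:
#include <mutex>
#include <optional>
#include <string>
#include <unordered_map>

std::mutex table_mutex;
std::unordered_map<std::string, int> table;

// Run a reader callback while holding the lock and hand back its result.
template <class F>
auto read_table(F&& f) {
  std::lock_guard<std::mutex> guard(table_mutex);
  return f(table);
}

std::optional<int> find_entry(const std::string& key) {
  return read_table([&](const std::unordered_map<std::string, int>& t)
                        -> std::optional<int> {
    auto it = t.find(key);
    if (it == t.end()) return std::nullopt;
    return it->second;
  });
}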
[aten/src/ATen/core/dispatch/Dispatcher.h]
Return call(Args... args) const {
return c10::Dispatcher::singleton().call<Return, Args...>(*this, std::forward<Args>(args)...);
}
[aten/src/ATen/core/dispatch/Dispatcher.h]
template<class Return, class... Args>
inline Return Dispatcher::call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const {
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
auto dispatchKey = op.operatorIterator_->op.dispatchKeyExtractor()
.template getDispatchKeyUnboxed<Args...>(
DispatchKeySet::FULL,
args...
);
return callWithDispatchKey<Return, Args...>(op, dispatchKey, args...);
}
Here we finally meet the DispatchKey, the heart of the dispatch mechanism: getDispatchKeyUnboxed inspects the unboxed arguments and computes the key that selects the kernel, and the call then proceeds to callWithDispatchKey.
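A conceptual sketch of what unboxed key extraction boils down to for ops that do take Tensor arguments (not the real implementation; the real extractor also honors the thread-local include/exclude sets):
#include <ATen/ATen.h>
#include <c10/core/DispatchKeySet.h>

// Union the key sets of all Tensor arguments and take the highest-priority key.
c10::DispatchKey pick_key(const at::Tensor& a, const at::Tensor& b) {
  c10::DispatchKeySet ks = a.key_set() | b.key_set();
  return ks.highestPriorityTypeId();
}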
[aten/src/ATen/core/dispatch/Dispatcher.h]
template<class Return, class... Args>
inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
// No alias dispatch key is allowed at runtime.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// By default, when there're no high-frequency or non-sampled callbacks,
// RecordFunction is pre-sampled as a perf optimization;
// shouldRunRecordFunction checks whether RecordFunction should be executed,
// and sets pre_sampled boolean argument value to whether pre-sampling was used -
// this boolean is passed into RecordFunction to adjust the sampling rates of
// the callbacks
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
// Check if we need to run callbacks registered with RecordFunction
// If true and callbacks need inputs, we box the arguments and pass
// them into the callbacks and also into the kernel call
// Note: for perf reasons we wouldn't want to pass arguments into
// the function call or prematurely box them
at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
int64_t seq_num = -1;
// Setting sequence number in the Autograd case to associate
// the forward range with the corresponding Autograd's node
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
torch::jit::Stack stack = impl::boxArgs(args...);
guard.before(op, stack, seq_num);
} else {
guard.before(op, seq_num);
}
}
}
// keeping the guard alive while executing the kernel
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
}
#endif // PYTORCH_DISABLE_PER_OP_PROFILING
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
}
The core of the dispatch is this line: op.operatorIterator_->op.lookup(dispatchKey) indexes the operator's dispatch table with the key.
[aten/src/ATen/core/dispatch/OperatorEntry.h]
const KernelFunction& lookup(DispatchKey k) const {
const auto& kernel = dispatchTable_[static_cast<uint8_t>(k)];
if (C10_UNLIKELY(!kernel.isValid())) {
reportError(k);
}
return kernel;
}
The kernel is simply dispatchTable_[static_cast<uint8_t>(k)]: each operator keeps an array of kernels indexed by the numeric value of the dispatch key, and lookup() returns the matching entry.
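To make the table indexing concrete, here is a toy model of a per-operator dispatch table (types and names are illustrative, not the real c10 ones):
#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <stdexcept>

enum class ToyDispatchKey : uint8_t { CPU, CUDA, Autograd, NumKeys };

struct ToyOperatorEntry {
  // One kernel slot per dispatch key, indexed by the key's numeric value.
  std::array<std::function<void()>, static_cast<std::size_t>(ToyDispatchKey::NumKeys)> table;

  const std::function<void()>& lookup(ToyDispatchKey k) const {
    const auto& kernel = table[static_cast<uint8_t>(k)];
    if (!kernel) throw std::runtime_error("no kernel registered for this key");
    return kernel;
  }
};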
Back in callWithDispatchKey, kernel.template call<Return, Args...> then invokes it:
[aten/src/ATen/core/boxing/KernelFunction_impl.h]
template<class Return, class... Args>
inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
// note: Args above is intentionally not Args&&. We don't want perfect
// forwarding, which would require Args to be deduced, but instead we
// want callers to explicitly specify the Args.
if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
}
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
boxed_kernel_func_ != nullptr,
"Tried to call KernelFunction::call() on an uninitialized KernelFunction."
);
return impl::BoxedKernelWrapper<Return(Args...)>::call(
boxed_kernel_func_,
functor_.get(),
opHandle,
std::forward<Args>(args)...
);
}
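KernelFunction keeps both an unboxed function pointer and a boxed fallback. "Boxing" means packing the typed arguments into a stack of IValues so that one uniform signature can serve any operator; a small sketch of the idea (torch::jit::Stack is just std::vector<c10::IValue>):
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>
#include <cstdint>
#include <vector>

using Stack = std::vector<c10::IValue>;

// Box two typed arguments into a stack, the way impl::boxArgs does for callbacks.
void push_args(Stack& stack, const at::Tensor& t, int64_t dim) {
  stack.emplace_back(t);    // Tensor wrapped in an IValue
  stack.emplace_back(dim);  // int64_t wrapped in an IValue
}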
The unboxed fast path goes through callUnboxedKernelFunction:
[aten/src/ATen/core/boxing/KernelFunction_impl.h]
template<class Return, class... Args>
inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
using ActualSignature = Return (OperatorKernel*, Args...);
ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
return (*func)(functor, std::forward<Args>(args)...);
}
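The unboxed_kernel_func_ pointer is stored as void* and cast back to its true signature before the call. A minimal standalone sketch of that trick (reinterpret_cast between function pointers and void* is what the real code relies on, even though it is only conditionally supported by the standard):
// Store a function pointer type-erased as void*, then recover its signature.
int add_one(int x) { return x + 1; }

int main() {
  void* erased = reinterpret_cast<void*>(&add_one);
  using Sig = int(int);
  Sig* fn = reinterpret_cast<Sig*>(erased);
  return fn(41) == 42 ? 0 : 1;
}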
[aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
return (*functor_)(std::forward<ParameterTypes>(args)...);
}
The kernel the dispatcher lands on first is the one registered for the BackendSelect key: factory ops like empty take no Tensor arguments, so there is no tensor to extract a backend key from; the BackendSelect kernel computes the key from the dtype/layout/device arguments and redispatches.
[build/aten/src/ATen/RegisterBackendSelect.cpp]
// aten::empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
static auto op = c10::Dispatcher::singleton()
.findSchemaOrThrow("aten::empty", "memory_format")
.typed<Tensor (IntArrayRef, c10::optional<ScalarType>, c10::optional<Layout>, c10::optional<Device>, c10::optional<bool>, c10::optional<MemoryFormat>)>();
DispatchKey _dk = c10::computeDispatchKey(dtype, layout, device);
return op.callWithDispatchKey(_dk, size, dtype, layout, device, pin_memory, memory_format);
}
[c10/core/TensorOptions.h]
// This is intended to be a centralized location by which we can determine
// what an appropriate DispatchKey for a tensor is.
inline DispatchKey computeDispatchKey(c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device) {
const auto layout_ = layout_or_default(layout);
const auto device_ = device_or_default(device);
switch (layout_) {
case Layout::Strided: {
const auto dtype_ = dtype_or_default(dtype);
switch (device_.type()) {
case DeviceType::CPU: {
if (isQIntType(dtype_)) {
return DispatchKey::QuantizedCPU;
}
return DispatchKey::CPU;
}
case DeviceType::CUDA: {
if (isQIntType(dtype_)) {
return DispatchKey::QuantizedCUDA;
}
return DispatchKey::CUDA;
}
case DeviceType::MKLDNN:
return DispatchKey::MKLDNN;
case DeviceType::OPENGL:
return DispatchKey::OpenGL;
case DeviceType::OPENCL:
return DispatchKey::OpenCL;
case DeviceType::IDEEP:
return DispatchKey::IDEEP;
case DeviceType::HIP:
return DispatchKey::HIP;
case DeviceType::FPGA:
return DispatchKey::FPGA;
case DeviceType::MSNPU:
return DispatchKey::MSNPU;
case DeviceType::XLA:
return DispatchKey::XLA;
case DeviceType::Vulkan:
return DispatchKey::Vulkan;
case DeviceType::Metal:
return DispatchKey::Metal;
default:
AT_ERROR("Unsupported device type for dense layout: ", device_.type());
}
}
case Layout::Sparse:
switch (device_.type()) {
case DeviceType::CPU:
return DispatchKey::SparseCPU;
case DeviceType::CUDA:
return DispatchKey::SparseCUDA;
case DeviceType::HIP:
return DispatchKey::SparseHIP;
default:
AT_ERROR("Unsupported device type for sparse layout: ", device_.type());
}
case Layout::Mkldnn:
switch (device_.type()) {
case DeviceType::CPU:
return DispatchKey::MkldnnCPU;
default:
AT_ERROR("Unsupported device type for mkldnn layout: ", device_.type());
}
default:
AT_ERROR("Unsupported layout: ", layout_);
}
}
For a dense (strided), non-quantized dtype on CPU this returns DispatchKey::CPU, so _dk = DispatchKey::CPU. With the backend key in hand, the operator is redispatched through callWithDispatchKey:
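A quick usage sketch of computeDispatchKey, relying only on the signature and the switch shown above:
#include <ATen/ATen.h>
#include <c10/core/TensorOptions.h>

int main() {
  // Dense float tensor on CPU -> DispatchKey::CPU.
  auto k1 = c10::computeDispatchKey(at::kFloat, at::kStrided, at::Device(at::kCPU));
  // Sparse float tensor on CUDA -> DispatchKey::SparseCUDA.
  auto k2 = c10::computeDispatchKey(at::kFloat, at::kSparse, at::Device(at::kCUDA));
  return (k1 == c10::DispatchKey::CPU && k2 == c10::DispatchKey::SparseCUDA) ? 0 : 1;
}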
[aten/src/ATen/core/dispatch/Dispatcher.h]
Return callWithDispatchKey(DispatchKey dispatchKey, Args... args) const {
return c10::Dispatcher::singleton().callWithDispatchKey<Return, Args...>(*this, dispatchKey, std::forward<Args>(args)...);
}
[aten/src/ATen/core/dispatch/Dispatcher.h]
inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle<Return(Args...)>& op, DispatchKey dispatchKey, Args... args) const {
detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5
// No alias dispatch key is allowed at runtime.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKey));
const KernelFunction& kernel = op.operatorIterator_->op.lookup(dispatchKey);
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
// By default, when there're no high-frequency or non-sampled callbacks,
// RecordFunction is pre-sampled as a perf optimization;
// shouldRunRecordFunction checks whether RecordFunction should be executed,
// and sets pre_sampled boolean argument value to whether pre-sampling was used -
// this boolean is passed into RecordFunction to adjust the sampling rates of
// the callbacks
bool pre_sampled = false;
if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) {
// Check if we need to run callbacks registered with RecordFunction
// If true and callbacks need inputs, we box the arguments and pass
// them into the callbacks and also into the kernel call
// Note: for perf reasons we wouldn't want to pass arguments into
// the function call or prematurely box them
at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled);
if (C10_UNLIKELY(guard.isActive())) {
if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) {
int64_t seq_num = -1;
// Setting sequence number in the Autograd case to associate
// the forward range with the corresponding Autograd's node
if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) {
seq_num = at::sequence_number::peek();
}
if (guard.needsInputs()) {
torch::jit::Stack stack = impl::boxArgs(args...);
guard.before(op, stack, seq_num);
} else {
guard.before(op, seq_num);
}
}
}
// keeping the guard alive while executing the kernel
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
}
#endif // PYTORCH_DISABLE_PER_OP_PROFILING
return kernel.template call<Return, Args...>(op, std::forward<Args>(args)...);
}
As before, kernel.template call<Return, Args...> invokes the selected kernel, this time the one registered for DispatchKey::CPU:
[aten/src/ATen/core/boxing/KernelFunction_impl.h]
template<class Return, class... Args>
inline Return KernelFunction::call(const OperatorHandle& opHandle, Args... args) const {
// note: Args above is intentionally not Args&&. We don't want perfect
// forwarding, which would require Args to be deduced, but instead we
// want callers to explicitly specify the Args.
if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
return callUnboxedKernelFunction<Return, Args...>(unboxed_kernel_func_, functor_.get(), std::forward<Args>(args)...);
}
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
boxed_kernel_func_ != nullptr,
"Tried to call KernelFunction::call() on an uninitialized KernelFunction."
);
return impl::BoxedKernelWrapper<Return(Args...)>::call(
boxed_kernel_func_,
functor_.get(),
opHandle,
std::forward<Args>(args)...
);
}
Again the unboxed path is taken via callUnboxedKernelFunction:
[aten/src/ATen/core/boxing/KernelFunction_impl.h]
template<class Return, class... Args>
inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, Args&&... args) {
using ActualSignature = Return (OperatorKernel*, Args...);
ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
return (*func)(functor, std::forward<Args>(args)...);
}
[aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h]
static ReturnType call(OperatorKernel* functor, ParameterTypes... args) {
KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
return (*functor_)(std::forward<ParameterTypes>(args)...);
}
[aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h]
decltype(auto) operator()(Parameters... args) {
return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...);
}
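The FuncPtr type carries the wrapped function pointer at compile time, so operator() needs no per-instance state. A minimal standalone sketch of that pattern (names are illustrative, not the real c10 templates):
#include <utility>

template <class FuncType, FuncType* func>
struct CompileTimeFnPtr {
  static constexpr FuncType* func_ptr() { return func; }
};

template <class FuncPtr>
struct WrapIntoFunctor {
  // Forwards to the statically known function pointer, like the code above.
  template <class... Parameters>
  decltype(auto) operator()(Parameters&&... args) {
    return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...);
  }
};

int square(int x) { return x * x; }

int main() {
  WrapIntoFunctor<CompileTimeFnPtr<int(int), &square>> f;
  return f(3) == 9 ? 0 : 1;
}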
With DispatchKey::CPU, the table lookup selects the kernel registered in RegisterCPU.cpp, which forwards to the native implementation:
[build/aten/src/ATen/RegisterCPU.cpp]
Tensor empty_memory_format(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format) {
return at::native::empty_cpu(size, dtype, layout, device, pin_memory, memory_format);
}
[aten/src/ATen/native/TensorFactories.cpp]
Tensor empty_cpu(IntArrayRef size, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
return at::detail::empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
}
[aten/src/ATen/Utils.cpp]
Tensor empty_cpu(
IntArrayRef size,
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt) {
Device device = device_or_default(device_opt);
TORCH_CHECK(device.type() == DeviceType::CPU);
check_size_nonnegative(size);
bool pin_memory = pinned_memory_or_default(pin_memory_opt);
c10::Allocator* allocator;
if (pin_memory) {
allocator = detail::getCUDAHooks().getPinnedMemoryAllocator();
} else {
allocator = at::getCPUAllocator();
}
int64_t nelements = prod_intlist(size);
caffe2::TypeMeta dtype = scalarTypeToTypeMeta(dtype_or_default(dtype_opt));
int64_t size_bytes = nelements * dtype.itemsize();
auto storage_impl = c10::make_intrusive<StorageImpl>(
c10::StorageImpl::use_byte_size_t(),
size_bytes,
allocator->allocate(size_bytes),
allocator,
/*resizeable=*/true);
auto tensor = detail::make_tensor<TensorImpl>(
std::move(storage_impl), at::DispatchKey::CPU, dtype);
// Default TensorImpl has size [0]
if (size.size() != 1 || size[0] != 0) {
tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size);
}
if (memory_format_opt.has_value()) {
// Restriding a just-created empty contiguous tensor does nothing.
if (*memory_format_opt != MemoryFormat::Contiguous) {
tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt);
}
}
return tensor;
}
The CPU path picks the allocator with at::getCPUAllocator() (or the pinned-memory allocator when pin_memory is set), computes the number of elements with prod_intlist(size), resolves the dtype with scalarTypeToTypeMeta(dtype_or_default(dtype_opt)), and derives the byte count as nelements * dtype.itemsize(); for at::empty({2, 3}) with float32 that is 6 * 4 = 24 bytes. It then instantiates a StorageImpl holding that allocation and wraps it in a TensorImpl tagged with DispatchKey::CPU via detail::make_tensor.
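A standalone sketch of the same allocation math using the APIs seen above (assuming a libtorch build; this mirrors what empty_cpu does for at::empty({2, 3}) with float32):
#include <ATen/ATen.h>
#include <cstdint>
#include <vector>

int main() {
  const std::vector<int64_t> size = {2, 3};
  int64_t nelements = 1;
  for (auto s : size) nelements *= s;                   // 6 elements
  caffe2::TypeMeta dtype = caffe2::TypeMeta::Make<float>();
  int64_t size_bytes = nelements * dtype.itemsize();    // 6 * 4 = 24 bytes
  at::Allocator* allocator = at::getCPUAllocator();
  at::DataPtr data = allocator->allocate(size_bytes);   // raw buffer handed to StorageImpl
  return data.get() != nullptr ? 0 : 1;
}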
[build/aten/src/ATen/core/TensorBody.h]
Tensor make_tensor(Args&&... args) {
return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
}