There are quite a few ways to convert PyTorch models to TensorRT on Linux.

https://github.com/NVIDIA-AI-IOT/torch2trt

The project above is one widely used option.

However, on Windows NVIDIA only ships TensorRT as a zip package, with no Python bindings.

Which is pretty suffocating.

So the only option is to build the wheel yourself.

Here is a shameless plug for my own:

https://github.com/leng-yue/ONNX-TensorRT-DLL

Converting the PyTorch model to ONNX

This step is fairly simple.

import torch
from torchvision.models import densenet169

# A batch of 10 dummy 3x224x224 images on the GPU, used to trace the model
dummy_input = torch.randn(10, 3, 224, 224, device='cuda')
model = densenet169(pretrained=True).cuda()

input_names = ["features"]
output_names = ["classifier"]

# torch.onnx.export returns nothing; the ./onnx directory must already exist
torch.onnx.export(model, dummy_input, "./onnx/dense.onnx", verbose=True,
                  input_names=input_names, output_names=output_names)

If nothing went wrong, you will find dense.onnx in the onnx directory.

Serializing the ONNX model to a TRT engine

You could also load and run the ONNX model directly; it is just slow…

Here is the tool I wrote for this:

extern "C" __declspec(dllexport) int ONNX2TRT(char* onnxFileName, char* trtFileName, int batchSize) 
{
	if (_access(onnxFileName, 04) != 0)
	{
		// The ONNX file cannot be read
		Debug("Can't Read ONNX File");
		return -1;
	}

	// create the builder
	IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
	if (builder == nullptr) 
	{
		Debug("Create Builder Failure");
		return -2;
	}

	// The ONNX parser needs an explicit-batch network, so pass the kEXPLICIT_BATCH flag
	// rather than the batch size (with explicit batch, the batch dimension comes from the ONNX model itself)
	const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
	nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);

	nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();

	auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

	if (!parser->parseFromFile(onnxFileName, static_cast<int>(gLogger.getReportableSeverity())))
	{
		Debug("Parse ONNX Failure");
		return -3;
	}

	builder->setMaxBatchSize(batchSize);
	builder->setMaxWorkspaceSize(1_GiB);
	config->setMaxWorkspaceSize(1_GiB);
	if (gArgs.runInFp16)
	{
		config->setFlag(BuilderFlag::kFP16);
	}
	if (gArgs.runInInt8)
	{
		config->setFlag(BuilderFlag::kINT8);
		samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
	}

	samplesCommon::enableDLA(builder, config, gArgs.useDLACore);

	// Build with the config so that the workspace size and precision flags actually take effect
	ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
	if (!engine)
	{
		Debug("Engine Build Failure");
		return -4;
	}

	// we can destroy the parser
	parser->destroy();

	// serialize the engine, then close everything down
	IHostMemory* trtModelStream = engine->serialize();

	engine->destroy();
	network->destroy();
	config->destroy();
	builder->destroy();

	if (!trtModelStream)
	{
		Debug("Serialize Fail");
		return -5;
	}

	ofstream ofs(trtFileName, std::ios::out | std::ios::binary);
	ofs.write((char*)(trtModelStream->data()), trtModelStream->size());
	ofs.close();
	trtModelStream->destroy();

	Debug("Save Success");

	return 0;
}

Just click the button and wait; building takes a few minutes or so. The possible return codes are:

Error Code    Description
-1            Can't Read ONNX File
-2            Create Builder Failure
-3            Parse ONNX Failure
-4            Engine Build Failure
-5            Serialize Fail
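
From the caller's point of view this is a single blocking call whose return value maps onto the table above. A minimal sketch, assuming the ONNX2TRT P/Invoke declaration from the "Calling from C#" section below and hypothetical file names:

// Hypothetical paths; ONNX2TRT blocks until the engine has been built and saved
int rc = ONNX2TRT("dense.onnx", "dense.trt", 1);
if (rc != 0)
{
	// Map the code against the table above, e.g. -3 means the ONNX file failed to parse
	Console.WriteLine("ONNX2TRT failed with error code " + rc);
}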

Loading and running the serialized TRT engine

extern "C" __declspec(dllexport) void* LoadNet(char* trtFileName)
{
	if (_access(trtFileName, 04) != 0)
	{
		Debug("Can't Read TRT File");
		return 0;
	}

	std::ifstream t(trtFileName, std::ios::in | std::ios::binary);
	std::stringstream tempStream;
	tempStream << t.rdbuf();
	t.close();
	Debug("TRT File Loaded");

	tempStream.seekg(0, std::ios::end);
	const int modelSize = tempStream.tellg();
	tempStream.seekg(0, std::ios::beg);
	void* modelMem = malloc(modelSize);
	tempStream.read((char*)modelMem, modelSize);

	IRuntime* runtime = createInferRuntime(gLogger);
	if (runtime == nullptr)
	{
		Debug("Build Runtime Failure");
		return 0;
	}

	if (gArgs.useDLACore >= 0)
	{
		runtime->setDLACore(gArgs.useDLACore);
	}

	ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
	// deserializeCudaEngine copies the buffer, so the host copy can be freed here
	free(modelMem);

	if (engine == nullptr)
	{
		Debug("Build Engine Failure");
		return 0;
	}

	IExecutionContext* context = engine->createExecutionContext();
	if (context == nullptr)
	{
		Debug("Build Context Failure");
		return 0;
	}

	TensorRT* trt = new TensorRT();
	trt->context = context;
	trt->engine = engine;
	trt->runtime = runtime;

	return trt;
}

extern "C" __declspec(dllexport) void ReleaseNet(void* trt) 
{
	TensorRT* curr = (TensorRT*)trt;
	curr->context->destroy();
	curr->engine->destroy();
	curr->runtime->destroy();

	delete curr;
}

extern "C" __declspec(dllexport) void DoInference(void* trt, char* input_name, char* output_name, float* input, float* output, int input_size, int output_size)
{
	TensorRT* curr = (TensorRT*)trt;

	const ICudaEngine& engine = curr->context->getEngine();
	// Pointers to the input and output device buffers that we pass to the engine.
	// The engine requires exactly IEngine::getNbBindings() of these, but here we know there is exactly one input and one output.
	assert(engine.getNbBindings() == 2);
	void* buffers[2];

	// In order to bind the buffers, we need to know the names of the input and output tensors.
	// note that indices are guaranteed to be less than IEngine::getNbBindings()

	const int inputIndex = engine.getBindingIndex(input_name);
	const int outputIndex = engine.getBindingIndex(output_name);

	// DebugP(inputIndex); DebugP(outputIndex);
	// create GPU buffers and a stream

	CHECK(cudaMalloc(&buffers[inputIndex], input_size * sizeof(float)));
	CHECK(cudaMalloc(&buffers[outputIndex], output_size * sizeof(float)));

	cudaStream_t stream;
	CHECK(cudaStreamCreate(&stream));

	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
	CHECK(cudaMemcpyAsync(buffers[inputIndex], input, input_size * sizeof(float), cudaMemcpyHostToDevice, stream));
	// The engine was built from an explicit-batch network, so we use enqueueV2 (it takes no batch-size argument)
	curr->context->enqueueV2(buffers, stream, nullptr);
	CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
	cudaStreamSynchronize(stream);

	// release the stream and the buffers
	cudaStreamDestroy(stream);
	CHECK(cudaFree(buffers[inputIndex]));
	CHECK(cudaFree(buffers[outputIndex]));
}

Calling from C#

[DllImport("TensorRT.dll")]
public static extern int ONNX2TRT(string onnxFileName, string trtFileName, int batchSize);

[DllImport("TensorRT.dll")]
public static extern IntPtr LoadNet(string trtFileName);

[DllImport("TensorRT.dll")]
public static extern void ReleaseNet(IntPtr ptr);

[DllImport("TensorRT.dll")]
public static extern void DoInference(IntPtr ptr, string input_name, string output_name, float[] input, float[] output, int input_size, int output_size);
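
Putting the four entry points together, a minimal usage sketch could look like the one below. The file name dense.trt, the batch size of 1, the 224×224 input and the 1000-class output length are illustrative assumptions and must match the dimensions the engine was actually built with; the tensor names "features" and "classifier" are the ones chosen during the ONNX export above, and the sizes are element counts, not bytes.

// Sketch only: file name and sizes are assumptions and must match the built engine
int batchSize = 1;
int inputSize = batchSize * 3 * 224 * 224;   // element count, not bytes
int outputSize = batchSize * 1000;           // DenseNet169 outputs 1000 class scores

IntPtr net = LoadNet("dense.trt");
if (net == IntPtr.Zero)
{
	Console.WriteLine("LoadNet failed");
	return;
}

float[] input = new float[inputSize];        // fill with preprocessed image data
float[] output = new float[outputSize];

// Tensor names must match the input_names / output_names used at export time
DoInference(net, "features", "classifier", input, output, inputSize, outputSize);

Console.WriteLine("First score: " + output[0]);

ReleaseNet(net);

LoadNet is called once; DoInference can then be called repeatedly on the same handle, and ReleaseNet frees the engine when you are done.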

Benchmarks

Below are the timings for DenseNet169 with the input resized to 128×128, running on an RTX 2070.

GitHub

https://github.com/leng-yue/ONNX-TensorRT-DLL

References

https://pytorch.org/docs/stable/onnx.html