Real-time Network Video Streaming with OpenCV C++ — a Deep-Learning Image-Processing Client and Server Built on YOLOv5 Face and TCP

Preface

On Windows, this article implements a complete real-time streaming client and server for deep-learning image processing, using the TCP protocol together with OpenCV C++ and YOLOv5 face. To achieve real-time transmission, the client is implemented with multiple threads, and the deep-learning model runs GPU inference through onnxruntime. The result looks like this:

(Demo video: real-time network video streaming with OpenCV C++)

1. Server

1.1 Server class implementation

First, define a server class. The class receives the images sent over the network and decodes them, receives the face keypoints detected by the client, and draws those keypoints onto the image that is finally displayed:

#include "Server.h"
#define SIZE 100

Server::Server()
{
}

Server::~Server()
{
}

bool Server::initialization(const int& port, const cv::VideoCapture& cap)
{
    m_port = port;
    m_cap = cap;
    // Initialize the Winsock library (Windows-specific)
    WORD w_req = MAKEWORD(2, 2);  // requested version
    WSADATA wsadata;
    int err = WSAStartup(w_req, &wsadata);
    if (err != 0) {
        std::cout << "Failed to initialize the Winsock library!" << std::endl;
        return false;
    }
    else {
        std::cout << "Winsock library initialized successfully!" << std::endl;
    }
    // Check the Winsock version
    if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2) {
        std::cout << "Winsock version mismatch!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "Winsock version is correct!" << std::endl;
    }
    return true;
}

bool Server::initialization(const int& port)
{
    m_port = port;
    // Initialize the Winsock library (Windows-specific)
    WORD w_req = MAKEWORD(2, 2);  // requested version
    WSADATA wsadata;
    int err = WSAStartup(w_req, &wsadata);
    if (err != 0) {
        std::cout << "Failed to initialize the Winsock library!" << std::endl;
        return false;
    }
    else {
        std::cout << "Winsock library initialized successfully!" << std::endl;
    }
    // Check the Winsock version
    if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2) {
        std::cout << "Winsock version mismatch!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "Winsock version is correct!" << std::endl;
    }
    return true;
}

bool Server::build_connect()
{
    // Server and client address structures
    SOCKADDR_IN server_addr;
    SOCKADDR_IN accept_addr;
    // Fill in the server address information
    server_addr.sin_family = AF_INET;  // address family, AF_INET: IPv4
    server_addr.sin_addr.S_un.S_addr = htonl(INADDR_ANY);  // htonl converts to network (big-endian) byte order;
                                                           // INADDR_ANY means all network interfaces of the server
    server_addr.sin_port = htons(m_port);  // port number
    // Create the socket (TCP)
    m_server = socket(AF_INET, SOCK_STREAM, 0);
    if (::bind(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR) {
        std::cout << "Failed to bind the socket!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "Socket bound successfully!" << std::endl;
    }
    // Put the socket into listening state
    if (listen(m_server, SOMAXCONN) < 0) {
        std::cout << "Failed to enter listening state!" << std::endl;
        WSACleanup();
        return false;
    }
    else {
        std::cout << "Listening state set successfully!" << std::endl;
    }
    std::cout << "The server is listening for connections, please wait..." << std::endl;
    // Accept a connection request
    int len = sizeof(SOCKADDR);
    m_accept = accept(m_server, (SOCKADDR*)&accept_addr, &len);
    if (m_accept == SOCKET_ERROR) {
        std::cout << "Connection failed!" << std::endl;
        WSACleanup();
        return false;
    }
    std::cout << "Connection established, ready to receive data" << std::endl;
    return true;
}

bool Server::send_data()
{
    cv::Mat frame;
    std::vector<uchar> data_encode;
    std::vector<int> params;  // encoding parameters
    params.resize(3, 0);
    params[0] = cv::IMWRITE_JPEG_QUALITY;  // JPEG quality (0-100)
    params[1] = 30;
    char frames_cnt[10] = { 0, };
    _itoa_s(int(m_cap.get(cv::CAP_PROP_FRAME_COUNT)), frames_cnt, 10);
    send(m_accept, frames_cnt, 10, 0);
    std::cout << "Sending started" << std::endl;
    int j = 0;
    while (m_cap.read(frame)) {
        m_file_in.push_back(frame.clone());
        imencode(".jpg", frame, data_encode, params);  // compress the frame
        int len_encoder = data_encode.size();
        _itoa_s(len_encoder, frames_cnt, 10);
        send(m_accept, frames_cnt, 10, 0);
        _itoa_s(SIZE, frames_cnt, 10);
        send(m_accept, frames_cnt, 10, 0);
        // send the encoded bytes in SIZE-byte chunks
        char send_char[SIZE] = { 0, };
        int index = 0;
        bool flag = false;
        for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
            for (int k = 0; k < SIZE; ++k) {
                if (index >= data_encode.size()) {
                    flag = true;
                    break;
                }
                send_char[k] = data_encode[index++];
            }
            send(m_accept, send_char, SIZE, 0);
        }
        data_encode.clear();
        ++j;
        std::cout << j << std::endl;  // the sender keeps sending
    }
    std::cout << "Sending finished";
    return true;
}

std::vector<std::string> split_string(const char str[], char delimiter)
{
    std::vector<std::string> tokens;
    std::istringstream stream(str);
    std::string token;
    // Split the string by the delimiter
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

float str_to_float(const std::string& str)
{
    // Convert std::string to a C-style string
    const char* c_str = str.c_str();
    // Convert the string to float with std::strtof
    char* endptr;
    float result = std::strtof(c_str, &endptr);
    // Check whether the conversion succeeded
    if (endptr == c_str) {
        std::cerr << "StringToFloat failed to convert string to float." << std::endl;
    }
    return result;
}

int str_to_point2f(std::vector<std::string> str, std::vector<cv::Point2f>& point2f)
{
    point2f.push_back(cv::Point2f(str_to_float(str[1]), str_to_float(str[2])));
    point2f.push_back(cv::Point2f(str_to_float(str[3]), str_to_float(str[4])));
    point2f.push_back(cv::Point2f(str_to_float(str[5]), str_to_float(str[6])));
    point2f.push_back(cv::Point2f(str_to_float(str[7]), str_to_float(str[8])));
    point2f.push_back(cv::Point2f(str_to_float(str[9]), str_to_float(str[10])));
    // Count how many of the parsed keypoints are (0, 0), i.e. placeholders for "no face detected"
    cv::Point2f p2f(0, 0);
    int index = 0;
    for (const cv::Point2f& p : point2f) {
        if (p == p2f) {
            index++;
        }
    }
    return index;
}

bool Server::receive_data()
{
    cv::Mat cv_frame;
    std::vector<uchar> data_decode;
    std::vector<int> params;  // encoding parameters
    params.resize(3, 0);
    params[0] = cv::IMWRITE_JPEG_QUALITY;  // JPEG quality (0-100)
    params[1] = image_quality;
    cv::namedWindow("ServerData", cv::WINDOW_NORMAL);
    char frams_cnt[100] = { 0, };
    int count = atoi(frams_cnt);
    int idx = 0;
    while (1) {
        // Receive the header: the JPEG byte length followed by the five face keypoints, separated by '#'
        int irecv = recv(m_accept, frams_cnt, 100, 0);
        char s = '#';
        std::vector<std::string> strs = split_string(frams_cnt, s);
        if (strs.size() != 11) {
            break;
        }
        int cnt = std::stoi(strs[0]);
        std::vector<cv::Point2f> face_points;
        int point_index = str_to_point2f(strs, face_points);
        data_decode.resize(cnt);  // resize the buffer to the JPEG byte length
        int index = 0;            // number of bytes received so far
        count = cnt;              // number of bytes still to read from the socket
        char* recv_char = new char[cnt];
        // A single recv may return fewer bytes than requested, so keep calling recv
        // until the whole frame has arrived. The condition must be count > 0;
        // count >= 0 would spin forever once everything has been received.
        while (count > 0) {
            int iRet = recv(m_accept, recv_char, count, 0);
            int tmp = 0;
            for (int k = 0; k < iRet; k++) {
                tmp = k + 1;
                index++;
                if (index >= cnt) { break; }
            }
            memcpy(&data_decode[index - tmp], recv_char, tmp);
            if (!iRet) { return false; }
            count -= iRet;  // update the number of bytes still to receive
        }
        delete[] recv_char;
        try {
            cv_frame = cv::imdecode(data_decode, cv::IMREAD_COLOR);
            if (!cv_frame.empty() && point_index < 2) {
                for (int i = 0; i < face_points.size(); ++i) {
                    cv::circle(cv_frame, face_points[i], 4, cv::Scalar(0, 0, 255), -1, cv::LINE_AA);
                }
                cv::imshow("ServerData", cv_frame);
                cv::waitKey(1);
                data_decode.clear();
            }
            else {
                data_decode.clear();
                continue;
            }
        }
        catch (const char* msg) {
            data_decode.clear();
            continue;
        }
    }
    std::cout << "Receiving finished";
    return true;
}

bool Server::send_data_frame(cv::Mat input)
{
    cv::Mat frame = input;
    std::vector<uchar> data_encode;
    std::vector<int> params;  // encoding parameters
    params.resize(3, 0);
    params[0] = cv::IMWRITE_JPEG_QUALITY;  // JPEG quality (0-100)
    params[1] = 100;
    char frames_cnt[10] = { 0, };
    std::cout << "Sending started" << std::endl;
    m_file_in.push_back(frame.clone());
    imencode(".jpg", frame, data_encode, params);  // compress the frame
    int len_encoder = data_encode.size();
    _itoa_s(len_encoder, frames_cnt, 10);
    send(m_accept, frames_cnt, 10, 0);
    _itoa_s(SIZE, frames_cnt, 10);
    send(m_accept, frames_cnt, 10, 0);
    // send the encoded bytes in SIZE-byte chunks
    char send_char[SIZE] = { 0, };
    int index = 0;
    bool flag = false;
    for (int i = 0; i < len_encoder / SIZE + 1; ++i) {
        for (int k = 0; k < SIZE; ++k) {
            if (index >= data_encode.size()) {
                flag = true;
                break;
            }
            send_char[k] = data_encode[index++];
        }
        send(m_accept, send_char, SIZE, 0);
    }
    data_encode.clear();
    std::cout << "Sending finished";
    return true;
}

bool Server::receive_data_frame(cv::Mat& output)
{
    cv::Mat frame;
    std::vector<uchar> data_decode;
    std::vector<int> params;  // encoding parameters
    params.resize(3, 0);
    params[0] = cv::IMWRITE_JPEG_QUALITY;  // JPEG quality (0-100)
    params[1] = 100;
    char frams_cnt[10] = { 0, };
    recv(m_accept, frams_cnt, 10, 0);
    // Parse the byte length of the encoded frame
    int cnt = atoi(frams_cnt);
    std::cout << "frams_cnt " << frams_cnt << " " << cnt << std::endl;
    recv(m_accept, frams_cnt, 10, 0);
    int size = atoi(frams_cnt);
    std::cout << "size " << size << std::endl;
    data_decode.resize(cnt);
    int index = 0;
    bool flag = true;
    char* recv_b = new char[cnt];
    std::cout << " cnt= " << cnt << std::endl;
    int iRecv = recv(m_accept, recv_b, cnt, 0);
    for (int i = 0; i < cnt; i++) {
        data_decode[index++] = recv_b[i];
    }
    std::cout << "data_decode " << data_decode.size() << std::endl;
    output = cv::imdecode(data_decode, cv::IMREAD_COLOR);
    std::cout << " output.size " << output.size().width << " " << output.size().height << std::endl;
    delete[] recv_b;
    data_decode.clear();
    std::cout << "Receiving finished";
    return true;
}

bool Server::free_connect()
{
    m_cap.release();
    // Close the sockets
    closesocket(m_server);
    closesocket(m_accept);
    // Release the Winsock resources
    WSACleanup();
    return true;
}

1.2 Using the class

The class is called in main. Port 8080 is used here; you can change the port as needed, as long as it is not already occupied (see the note after the code for a quick way to check):

#include "server.h"int recv_online_video(int port = 8080)
{Server ser;ser.initialization(port);ser.build_connect();ser.receive_data();ser.free_connect();return 0;
}int main() 
{recv_online_video(8080);return 0;
}

2. Client

2.1 Client class implementation

The client runs face detection on the video frames using YOLOv5s face, packs the keypoints of the detected face together with the frame, serializes the data, and sends it to the server.
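
Before looking at the code, it helps to spell out the simple wire format the client and server agree on (the coordinate values below are made up purely for illustration): for every frame the client first sends a fixed 100-byte header containing the JPEG byte length and the five facial keypoints, all joined with '#', and then sends the JPEG bytes themselves in a second send call. A header therefore looks roughly like this:

48213#312.500#401.250#388.125#399.750#351.000#452.625#320.375#501.500#382.250#503.875

Splitting this on '#' yields 11 tokens (the length plus 10 coordinates), which is exactly what the server's receive_data checks for, and five (0.000, 0.000) points mark a frame in which no face was detected. The client class is implemented as follows: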

#include "CameraClient.h"CameraClient::CameraClient(std::string detect_model_path)
{yolov5_face = new YOLOV5Face();yolov5_face->set_gpu(0, 4);yolov5_face->set_num_thread(4);yolov5_face->read_model(detect_model_path);
}CameraClient::~CameraClient()
{}std::string float_to_string(float value, int precision) 
{std::stringstream ss;ss << std::fixed << std::setprecision(precision) << value;return ss.str();
}std::string point_to_str(cv::Point2f point, int precision)
{return  ("#" + float_to_string(point.x, 3) + "#" + float_to_string(point.y, 3));
}std::string points_to_str(std::vector<cv::Point2f> points, int precision)
{std::string strs;for (int i = 0; i < points.size(); ++i){std::string s = point_to_str(points[i], precision);strs += s;}return strs;

void CameraClient::get_camera_frame(std::string camera_path, int width, int height, int fps, bool is_show)
{
    // Check that the requested resolution is one of the supported camera sizes
    bool found = false;
    cv::Size size(width, height);
    for (const cv::Size& s : this->camera_size) {
        if (s == size) {
            found = true;
            break;
        }
    }
    if (!found) {
        std::cerr << " Camera resolution setting error." << std::endl;
        return;
    }
    int mask = 0;
    if (size == cv::Size(540, 960)) {
        size = cv::Size(720, 1280);
        mask = 1;
    }
    cv::VideoCapture cap(camera_path);
    if (!cap.isOpened()) {
        std::cerr << " Camera is not opened." << std::endl;
        return;
    }
    cap.set(cv::CAP_PROP_FRAME_WIDTH, size.width);
    cap.set(cv::CAP_PROP_FRAME_HEIGHT, size.height);
    cap.set(cv::CAP_PROP_FPS, fps);
    // Matrix holding the grabbed frame
    cv::Mat cv_frame;
    while (true) {
        cap >> cv_frame;
        if (cv_frame.empty()) {
            break;
        }
        if (mask == 1) {
            cv::resize(cv_frame, cv_frame, cv::Size(540, 960));
        }
        mutex_lock.lock();
        // Keep at most max_frame frames in the queue to bound memory usage
        if (mat_info_input.size() > max_frame) {
            mat_info_input.pop();
        }
        else {
            std::vector<FaceInfo> src_face_infos;
            MatInfo mat_info;
            if (yolov5_face->detect(cv_frame, src_face_infos)) {
                mat_info.points = src_face_infos[0].points;
                mat_info.cv_mat = cv_frame.clone();
            }
            else {
                // No face detected: attach five (0, 0) placeholder keypoints
                std::vector<cv::Point2f> p(5, cv::Point2f(0.0f, 0.0f));
                mat_info.cv_mat = cv_frame.clone();
                mat_info.points = p;
            }
            mat_info_input.push(mat_info);
        }
        mutex_lock.unlock();
        if (is_show) {
            cv::namedWindow("cam src", 0);
            cv::imshow("cam src", cv_frame);
            if (cv::waitKey(1) == 27) {
                break;
            }
        }
    }
}

int CameraClient::send_camera_frame(std::string server_ip, int port)
{
    // Requested Winsock version
    WORD w_req = MAKEWORD(2, 2);
    // WSADATA holds the initialization information
    WSADATA wsadata;
    int err;
    // Initialize the Winsock library
    err = WSAStartup(w_req, &wsadata);
    if (err != 0) {
        std::cout << "Failed to initialize the Winsock library!" << std::endl;
        return -1;
    }
    else {
        std::cout << "Winsock library initialized successfully!" << std::endl;
    }
    // Check the Winsock version
    if (LOBYTE(wsadata.wVersion) != 2 || HIBYTE(wsadata.wHighVersion) != 2) {
        std::cout << "Winsock version mismatch!" << std::endl;
        WSACleanup();
        return -1;
    }
    else {
        std::cout << "Winsock version is correct!" << std::endl;
    }
    // Server address structure (IPv4)
    SOCKADDR_IN server_addr;
    server_addr.sin_family = AF_INET;
    // Convert the IP address string to its binary form
    struct in_addr addr;
    int result = inet_pton(AF_INET, server_ip.c_str(), &addr);
    if (result <= 0) {
        if (result == 0) {
            std::cerr << "inet_pton failed: address is invalid." << std::endl;
        }
        else {
            std::cerr << "inet_pton failed: error occurred." << std::endl;
        }
        return -1;
    }
    server_addr.sin_addr = addr;
    // Port number, converted to network byte order
    server_addr.sin_port = htons(port);
    // Create the socket and connect to the server
    SOCKET m_server = socket(AF_INET, SOCK_STREAM, 0);
    if (connect(m_server, (SOCKADDR*)&server_addr, sizeof(SOCKADDR)) == SOCKET_ERROR) {
        std::cout << "Failed to connect to the server!" << std::endl;
        WSACleanup();
        return -1;
    }
    else {
        std::cout << "Connected to the server!" << std::endl;
    }
    // Frame taken from the queue and its face keypoints
    cv::Mat cv_frame;
    std::vector<cv::Point2f> face_points;
    // Buffer for the JPEG-encoded data
    std::vector<uchar> data_encode;
    // JPEG encoding parameters
    std::vector<int> params;
    params.resize(4, 0);
    params[0] = cv::IMWRITE_JPEG_QUALITY;  // compression type: JPEG
    params[1] = image_quality;             // quality value (0-100)
    // Header buffer: JPEG length plus the serialized keypoints
    char frames_cnt[100] = { 0, };
    std::cout << "Sending started" << std::endl;
    int j = 0;
    while (true) {
        // Lock the mutex to access the input queue safely
        mutex_lock.lock();
        if (mat_info_input.empty()) {
            // Queue is empty: release the lock and wait briefly
            mutex_lock.unlock();
            Sleep(3);
            continue;
        }
        else {
            // Take one frame and its keypoints from the queue
            cv_frame = mat_info_input.front().cv_mat;
            face_points = mat_info_input.front().points;
            mat_info_input.pop();
            mutex_lock.unlock();
            Sleep(1);
        }
        // JPEG-compress the frame
        imencode(".jpg", cv_frame, data_encode, params);
        std::string str = points_to_str(face_points, 3);
        // Size of the compressed data
        int len_encoder = data_encode.size();
        char len[10] = { 0, };
        // Convert the size to a string
        _itoa_s(len_encoder, len, 10);
        // Build the header: length first, then the '#'-separated keypoints
        strcpy(frames_cnt, len);
        strcat(frames_cnt, str.c_str());
        std::cout << str.size() << " len_encoder = " << frames_cnt << std::endl;
        send(m_server, frames_cnt, 100, 0);
        // Send the compressed image data
        char* send_b = new char[data_encode.size()];
        memcpy(send_b, &data_encode[0], data_encode.size());
        int iSend = send(m_server, send_b, data_encode.size(), 0);
        delete[] send_b;
        // Clear the buffer for the next frame
        data_encode.clear();
        ++j;
    }
    std::cout << "Sending finished";
    closesocket(m_server);  // close the socket
    WSACleanup();           // clean up Winsock
    return 0;
}

2.2 Using the class

In main, the two member functions are launched on separate threads:

#include "CameraClient.h"
#include <thread>

int main()
{
    std::string video_path = "A.mp4";
    std::string server_ip = "192.168.xx.xxx";
    int port = 8080;
    std::string detect_model_path = "models/yolov5face-s-640x640.onnx";
    CameraClient video_client(detect_model_path);
    int w = 720;
    int h = 1280;
    std::thread Get(&CameraClient::get_camera_frame, &video_client, video_path, w, h, 25, true);
    std::thread Send(&CameraClient::send_camera_frame, &video_client, server_ip, port);
    Get.join();
    Send.join();
    return 0;
}

2.3 Server IP address

On Windows there are several ways to obtain the machine's IP address, including command-line tools, programming interfaces, and the graphical user interface. Some common methods are listed below.

Using command-line tools

In the Command Prompt (cmd) or PowerShell, the ipconfig command shows the IP addresses:

ipconfig


This lists detailed information for every network adapter, including IPv4 and IPv6 addresses. If you only want the IP address of a specific adapter, you can use:

ipconfig | findstr "adapter name"

Replace "adapter name" with the name of the network adapter you want to query.
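
As a concrete example, on an English-language Windows installation (where the address lines contain the text "IPv4"), the following prints only the IPv4 address lines; on systems in other languages the keyword in the output differs accordingly:

ipconfig | findstr "IPv4"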

Using the Windows API

If you need to obtain the system's IP address from code, you can use the Windows API. For example, with the following C++ program:

#include <winsock2.h>
#include <iphlpapi.h>
#include <iostream>

#pragma comment(lib, "iphlpapi.lib")
#pragma comment(lib, "ws2_32.lib")

int main() {
    WSADATA d;
    if (WSAStartup(MAKEWORD(2, 2), &d) != 0) {
        return -1;
    }
    ULONG outBufLen = sizeof(IP_ADAPTER_INFO);
    PIP_ADAPTER_INFO pAdapterInfo = (IP_ADAPTER_INFO*)malloc(outBufLen);
    DWORD dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    if (dwRetVal == ERROR_BUFFER_OVERFLOW) {
        // Buffer too small: reallocate with the required size and try again
        free(pAdapterInfo);
        pAdapterInfo = (IP_ADAPTER_INFO*)malloc(outBufLen);
        dwRetVal = GetAdaptersInfo(pAdapterInfo, &outBufLen);
    }
    if (dwRetVal == NO_ERROR) {
        for (PIP_ADAPTER_INFO pAdapter = pAdapterInfo; pAdapter != NULL; pAdapter = pAdapter->Next) {
            std::cout << "Adapter name: " << pAdapter->AdapterName << std::endl;
            std::cout << "IPv4 address: " << pAdapter->IpAddressList.IpAddress.String << std::endl;
            // Further IP addresses of each adapter can be iterated here as well
        }
    }
    free(pAdapterInfo);
    WSACleanup();
    return 0;
}

3. Face detection

YOLOv5 is a popular real-time object detection system that uses a single convolutional neural network to predict both the class and the location of objects. Although YOLOv5 is mainly used for general object detection, it can also be trained or fine-tuned to detect a specific kind of object, such as faces. The YOLOv5s-face model used here also outputs five facial landmarks for each detected face, and these landmarks are the keypoints the client sends to the server:

#include "YOLOV5Face.h"YOLOV5Face::YOLOV5Face(std::string& onnx_path, unsigned int _num_threads)
{std::wstring widestr = std::wstring(onnx_path.begin(), onnx_path.end());Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "YOLOV5Face");Ort::SessionOptions session_options;session_options.SetIntraOpNumThreads(4);OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0);session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);ort_session = new Ort::Session(env, widestr.c_str(), session_options);session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);Ort::AllocatorWithDefaultOptions allocator;input_name = ort_session->GetInputName(0, allocator);input_node_names.resize(1);input_node_names[0] = input_name;Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);auto tensor_info = type_info.GetTensorTypeAndShapeInfo();input_tensor_size = 1;input_node_dims = tensor_info.GetShape();for (unsigned int i = 0; i < input_node_dims.size(); ++i)input_tensor_size *= input_node_dims.at(i);input_values_handler.resize(input_tensor_size);// 4. output names & output dimmsnum_outputs = ort_session->GetOutputCount();output_node_names.resize(num_outputs);for (unsigned int i = 0; i < num_outputs; ++i){output_node_names[i] = ort_session->GetOutputName(i, allocator);Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();auto output_dims = output_tensor_info.GetShape();output_node_dims.push_back(output_dims);}
}YOLOV5Face::YOLOV5Face()
{}void YOLOV5Face::set_gpu(int gpu_index, int gpu_ram)
{std::vector<std::string> available_providers = Ort::GetAvailableProviders();auto cuda_available = std::find(available_providers.begin(),available_providers.end(), "CUDAExecutionProvider");if (gpu_index >= 0 && (cuda_available != available_providers.end())){OrtCUDAProviderOptions cuda_options;cuda_options.device_id = gpu_index;cuda_options.arena_extend_strategy = 0;if (gpu_ram == -1){cuda_options.gpu_mem_limit = ~0ULL;}else{cuda_options.gpu_mem_limit = size_t(gpu_ram * 1024 * 1024 * 1024);}cuda_options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch::OrtCudnnConvAlgoSearchExhaustive;cuda_options.do_copy_in_default_stream = 1;session_options.AppendExecutionProvider_CUDA(cuda_options);}}void YOLOV5Face::set_num_thread(int num_thread)
{_num_thread = num_thread;session_options.SetInterOpNumThreads(num_thread);session_options.SetIntraOpNumThreads(num_thread);session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

bool YOLOV5Face::read_model(const std::string model_path)
{
    try
    {
        std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
        ort_session = new Ort::Session(env, widestr.c_str(), session_options);
        Ort::AllocatorWithDefaultOptions allocator;
        // Input name and dimensions
        input_name = ort_session->GetInputName(0, allocator);
        input_node_names.resize(1);
        input_node_names[0] = input_name;
        Ort::TypeInfo type_info = ort_session->GetInputTypeInfo(0);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        input_tensor_size = 1;
        input_node_dims = tensor_info.GetShape();
        for (unsigned int i = 0; i < input_node_dims.size(); ++i)
            input_tensor_size *= input_node_dims.at(i);
        input_values_handler.resize(input_tensor_size);
        // Output names and dimensions
        num_outputs = ort_session->GetOutputCount();
        output_node_names.resize(num_outputs);
        for (unsigned int i = 0; i < num_outputs; ++i)
        {
            output_node_names[i] = ort_session->GetOutputName(i, allocator);
            Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
            auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
            auto output_dims = output_tensor_info.GetShape();
            output_node_dims.push_back(output_dims);
        }
        return true;
    }
    catch (const std::exception&)
    {
        return false;
    }
}

cv::Mat normalize(const cv::Mat& mat, float mean, float scale)
{
    cv::Mat matf;
    if (mat.type() != CV_32FC3) mat.convertTo(matf, CV_32FC3);
    else matf = mat;  // reference
    return (matf - mean) * scale;
}

void normalize(const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale)
{
    outmat = normalize(inmat, mean, scale);
}

void normalize_inplace(cv::Mat& mat_inplace, float mean, float scale)
{
    if (mat_inplace.type() != CV_32FC3) mat_inplace.convertTo(mat_inplace, CV_32FC3);
    normalize(mat_inplace, mat_inplace, mean, scale);
}

Ort::Value create_tensor(const cv::Mat& mat,
                         const std::vector<int64_t>& tensor_dims,
                         const Ort::MemoryInfo& memory_info_handler,
                         std::vector<float>& tensor_value_handler,
                         unsigned int data_format) throw(std::runtime_error)
{
    const unsigned int rows = mat.rows;
    const unsigned int cols = mat.cols;
    const unsigned int channels = mat.channels();
    cv::Mat mat_ref;
    if (mat.type() != CV_32FC(channels)) mat.convertTo(mat_ref, CV_32FC(channels));
    else mat_ref = mat;  // reference only, zero-time cost; supports 1/2/3/... channels
    if (tensor_dims.size() != 4) throw std::runtime_error("dims mismatch.");
    if (tensor_dims.at(0) != 1) throw std::runtime_error("batch != 1");
    // CxHxW layout
    if (data_format == CHW)
    {
        const unsigned int target_height = tensor_dims.at(2);
        const unsigned int target_width = tensor_dims.at(3);
        const unsigned int target_channel = tensor_dims.at(1);
        const unsigned int target_tensor_size = target_channel * target_height * target_width;
        if (target_channel != channels) throw std::runtime_error("channel mismatch.");
        tensor_value_handler.resize(target_tensor_size);
        cv::Mat resize_mat_ref;
        if (target_height != rows || target_width != cols)
            cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
        else resize_mat_ref = mat_ref;  // reference only, zero-time cost
        std::vector<cv::Mat> mat_channels;
        cv::split(resize_mat_ref, mat_channels);
        // copy channel by channel into the CxHxW buffer
        for (unsigned int i = 0; i < channels; ++i)
            std::memcpy(tensor_value_handler.data() + i * (target_height * target_width),
                        mat_channels.at(i).data, target_height * target_width * sizeof(float));
        return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
                                               target_tensor_size, tensor_dims.data(),
                                               tensor_dims.size());
    }
    // HxWxC layout
    const unsigned int target_height = tensor_dims.at(1);
    const unsigned int target_width = tensor_dims.at(2);
    const unsigned int target_channel = tensor_dims.at(3);
    const unsigned int target_tensor_size = target_channel * target_height * target_width;
    if (target_channel != channels) throw std::runtime_error("channel mismatch!");
    tensor_value_handler.resize(target_tensor_size);
    cv::Mat resize_mat_ref;
    if (target_height != rows || target_width != cols)
        cv::resize(mat_ref, resize_mat_ref, cv::Size(target_width, target_height));
    else resize_mat_ref = mat_ref;  // reference only, zero-time cost
    std::memcpy(tensor_value_handler.data(), resize_mat_ref.data, target_tensor_size * sizeof(float));
    return Ort::Value::CreateTensor<float>(memory_info_handler, tensor_value_handler.data(),
                                           target_tensor_size, tensor_dims.data(),
                                           tensor_dims.size());
}

Ort::Value YOLOV5Face::transform(const cv::Mat& mat_rs)
{
    cv::Mat canvas;
    cv::cvtColor(mat_rs, canvas, cv::COLOR_BGR2RGB);
    normalize_inplace(canvas, mean_val, scale_val);  // float32
    return create_tensor(canvas, input_node_dims, memory_info_handler, input_values_handler, CHW);
}

void YOLOV5Face::resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
                                int target_height, int target_width,
                                ScaleParams& scale_params)
{
    if (mat.empty()) return;
    int img_height = static_cast<int>(mat.rows);
    int img_width = static_cast<int>(mat.cols);
    mat_rs = cv::Mat(target_height, target_width, CV_8UC3, cv::Scalar(0, 0, 0));
    // scale ratio (new / old), new_shape (h, w)
    float w_r = (float)target_width / (float)img_width;
    float h_r = (float)target_height / (float)img_height;
    float r = std::min(w_r, h_r);
    // compute padding
    int new_unpad_w = static_cast<int>((float)img_width * r);   // floor
    int new_unpad_h = static_cast<int>((float)img_height * r);  // floor
    int pad_w = target_width - new_unpad_w;   // >= 0
    int pad_h = target_height - new_unpad_h;  // >= 0
    int dw = pad_w / 2;
    int dh = pad_h / 2;
    // resize with unchanged aspect ratio and paste into the letterbox canvas
    cv::Mat new_unpad_mat;
    cv::resize(mat, new_unpad_mat, cv::Size(new_unpad_w, new_unpad_h));
    new_unpad_mat.copyTo(mat_rs(cv::Rect(dw, dh, new_unpad_w, new_unpad_h)));
    // record scale params
    scale_params.ratio = r;
    scale_params.dw = dw;
    scale_params.dh = dh;
    scale_params.flag = true;
}

void YOLOV5Face::generate_bboxes_kps(const ScaleParams& scale_params,
                                     std::vector<lite::types::BoxfWithLandmarks>& bbox_kps_collection,
                                     std::vector<Ort::Value>& output_tensors, float score_threshold,
                                     float img_height, float img_width)
{
    Ort::Value& output = output_tensors.at(0);           // (1, n, 16 = 4 + 1 + 10 + 1)
    auto output_dims = output_node_dims.at(0);           // (1, n, 16)
    const unsigned int num_anchors = output_dims.at(1);  // n
    const float* output_ptr = output.GetTensorMutableData<float>();
    float r_ = scale_params.ratio;
    int dw_ = scale_params.dw;
    int dh_ = scale_params.dh;
    bbox_kps_collection.clear();
    unsigned int count = 0;
    for (unsigned int i = 0; i < num_anchors; ++i)
    {
        const float* row_ptr = output_ptr + i * 16;
        float obj_conf = row_ptr[4];
        if (obj_conf < score_threshold) continue;  // filter by objectness first
        float cls_conf = row_ptr[15];
        if (cls_conf < score_threshold) continue;  // face score
        // bounding box: (cx, cy, w, h) mapped back to the original image
        const float* offsets = row_ptr;
        float cx = offsets[0];
        float cy = offsets[1];
        float w = offsets[2];
        float h = offsets[3];
        lite::types::BoxfWithLandmarks box_kps;
        float x1 = ((cx - w / 2.f) - (float)dw_) / r_;
        float y1 = ((cy - h / 2.f) - (float)dh_) / r_;
        float x2 = ((cx + w / 2.f) - (float)dw_) / r_;
        float y2 = ((cy + h / 2.f) - (float)dh_) / r_;
        box_kps.box.x1 = std::max(0.f, x1);
        box_kps.box.y1 = std::max(0.f, y1);
        box_kps.box.x2 = std::min(img_width - 1.f, x2);
        box_kps.box.y2 = std::min(img_height - 1.f, y2);
        box_kps.box.score = cls_conf;
        box_kps.box.label = 1;
        box_kps.box.label_text = "face";
        box_kps.box.flag = true;
        // the five landmarks, also mapped back to the original image
        const float* kps_offsets = row_ptr + 5;
        for (unsigned int j = 0; j < 10; j += 2)
        {
            cv::Point2f kps;
            float kps_x = (kps_offsets[j] - (float)dw_) / r_;
            float kps_y = (kps_offsets[j + 1] - (float)dh_) / r_;
            kps.x = std::min(std::max(0.f, kps_x), img_width - 1.f);
            kps.y = std::min(std::max(0.f, kps_y), img_height - 1.f);
            box_kps.landmarks.points.push_back(kps);
        }
        box_kps.landmarks.flag = true;
        box_kps.flag = true;
        bbox_kps_collection.push_back(box_kps);
        count += 1;  // limit the number of boxes passed to NMS
        if (count > max_nms)
            break;
    }
}

void YOLOV5Face::nms_bboxes_kps(std::vector<lite::types::BoxfWithLandmarks>& input,
                                std::vector<lite::types::BoxfWithLandmarks>& output,
                                float iou_threshold, unsigned int topk)
{
    if (input.empty()) return;
    // sort by score, highest first
    std::sort(input.begin(), input.end(),
              [](const lite::types::BoxfWithLandmarks& a, const lite::types::BoxfWithLandmarks& b)
              { return a.box.score > b.box.score; });
    const unsigned int box_num = input.size();
    std::vector<int> merged(box_num, 0);
    unsigned int count = 0;
    for (unsigned int i = 0; i < box_num; ++i)
    {
        if (merged[i]) continue;
        std::vector<lite::types::BoxfWithLandmarks> buf;
        buf.push_back(input[i]);
        merged[i] = 1;
        for (unsigned int j = i + 1; j < box_num; ++j)
        {
            if (merged[j]) continue;
            float iou = static_cast<float>(input[i].box.iou_of(input[j].box));
            if (iou > iou_threshold)
            {
                merged[j] = 1;
                buf.push_back(input[j]);
            }
        }
        output.push_back(buf[0]);
        // keep top k
        count += 1;
        if (count >= topk)
            break;
    }
}

void YOLOV5Face::detect(const cv::Mat& mat, std::vector<lite::types::BoxfWithLandmarks>& detected_boxes_kps,
                        float score_threshold, float iou_threshold, unsigned int topk)
{
    if (mat.empty()) return;
    auto img_height = static_cast<float>(mat.rows);
    auto img_width = static_cast<float>(mat.cols);
    const int target_height = (int)input_node_dims.at(2);
    const int target_width = (int)input_node_dims.at(3);
    // resize & unscale (letterbox)
    cv::Mat mat_rs;
    ScaleParams scale_params;
    this->resize_unscale(mat, mat_rs, target_height, target_width, scale_params);
    // 1. make input tensor
    Ort::Value input_tensor = this->transform(mat_rs);
    // 2. inference scores & boxes
    auto output_tensors = ort_session->Run(Ort::RunOptions{ nullptr }, input_node_names.data(),
                                           &input_tensor, 1, output_node_names.data(), num_outputs);
    // 3. rescale & exclude
    std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
    this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
                              score_threshold, img_height, img_width);
    // 4. hard nms with topk
    this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
}

bool YOLOV5Face::detect(const cv::Mat& cv_src, std::vector<FaceInfo>& face_infos, int position,
                        float score_threshold, float iou_threshold, unsigned int topk)
{
    auto img_height = static_cast<float>(cv_src.rows);
    auto img_width = static_cast<float>(cv_src.cols);
    const int target_height = (int)input_node_dims.at(2);
    const int target_width = (int)input_node_dims.at(3);
    // resize & unscale (letterbox)
    cv::Mat mat_rs;
    ScaleParams scale_params;
    this->resize_unscale(cv_src, mat_rs, target_height, target_width, scale_params);
    // 1. make input tensor
    Ort::Value input_tensor = this->transform(mat_rs);
    // 2. inference scores & boxes
    auto output_tensors = ort_session->Run(Ort::RunOptions{ nullptr }, input_node_names.data(),
                                           &input_tensor, 1, output_node_names.data(), num_outputs);
    // 3. rescale & exclude
    std::vector<lite::types::BoxfWithLandmarks> bbox_kps_collection;
    this->generate_bboxes_kps(scale_params, bbox_kps_collection, output_tensors,
                              score_threshold, img_height, img_width);
    std::vector<lite::types::BoxfWithLandmarks> detected_boxes_kps;
    // 4. hard nms with topk
    this->nms_bboxes_kps(bbox_kps_collection, detected_boxes_kps, iou_threshold, topk);
    if (detected_boxes_kps.size() == 1)
    {
        FaceInfo info;
        info.bbox.xmin = detected_boxes_kps[0].box.rect().x;
        info.bbox.ymin = detected_boxes_kps[0].box.rect().y;
        info.bbox.xmax = detected_boxes_kps[0].box.rect().br().x;
        info.bbox.ymax = detected_boxes_kps[0].box.rect().br().y;
        info.points = detected_boxes_kps[0].landmarks.points;
        face_infos.push_back(info);
        return true;
    }
    else if (detected_boxes_kps.size() > 1 && position == 1)
    {
        // More than one face: keep the largest face among the high-confidence detections
        int arec = 0;
        int index = 0;
        for (int i = 0; i < detected_boxes_kps.size(); ++i)
        {
            if (detected_boxes_kps[i].box.score >= 0.7)
            {
                if (arec <= detected_boxes_kps[i].box.rect().area())
                {
                    arec = detected_boxes_kps[i].box.rect().area();
                    index = i;
                }
            }
        }
        FaceInfo info;
        info.bbox.xmin = detected_boxes_kps[index].box.rect().x;
        info.bbox.ymin = detected_boxes_kps[index].box.rect().y;
        info.bbox.xmax = detected_boxes_kps[index].box.rect().br().x;
        info.bbox.ymax = detected_boxes_kps[index].box.rect().br().y;
        info.points = detected_boxes_kps[index].landmarks.points;
        face_infos.push_back(info);
        return true;
    }
    return false;
}
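
For reference, a minimal standalone test of the detector, assuming the class and model path from the sections above (the input image file name here is hypothetical), would look roughly like this:

#include "YOLOV5Face.h"
#include <opencv2/opencv.hpp>

int main()
{
    // Load the model once, then detect faces in a single test image and draw the landmarks
    YOLOV5Face detector;
    detector.set_gpu(0, 4);   // GPU 0, 4 GB memory limit
    detector.set_num_thread(4);
    if (!detector.read_model("models/yolov5face-s-640x640.onnx"))
    {
        std::cerr << "Failed to load the model." << std::endl;
        return -1;
    }
    cv::Mat cv_src = cv::imread("test.jpg");  // hypothetical test image
    std::vector<FaceInfo> faces;
    if (detector.detect(cv_src, faces))
    {
        for (const cv::Point2f& p : faces[0].points)
            cv::circle(cv_src, p, 4, cv::Scalar(0, 0, 255), -1, cv::LINE_AA);
    }
    cv::imshow("result", cv_src);
    cv::waitKey(0);
    return 0;
}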

4. Results

The complete source code can be downloaded here: https://download.csdn.net/download/matt45m/89595413
