不是很精确,有点慢,但是也够用,胜在免费(free)
效果图:
一些对比:
模型来自于:
https://huggingface.co/models
文件目录
调用模型的代码:
运行此段代码,执行翻译
一些简单的设置在这里控制
pdf2chines.py
import os

import cv2
import easyocr
from PIL import Image
from PIL import Image, ImageDraw, ImageFont

import rect_dealer
from img_text import ImgText

# Directory holding the easyocr models (passed as model_storage_directory).
cut_model_path = r"F:\ocr\cut_model"
# Model for detecting manga text; poor quality, worse than easyocr.
detect_model_jap_path = r"F:\ocr\meta_model\manga-ocr-base"
# Meta's translation model.
trans_model_path = r"F:\ocr\meta_model\m2m100_1.2B"
# Directory the rendered PDF pages are written to.
pdf2png_save_path = r"F:\ocr\pdf2png"
# Directory containing the PDFs to translate.
pdf_path = r"F:\ocr\pdfs"
# Rectangles whose accumulated confidence is below this are not translated.
pass_point = 0.05
blank_png_path = r"F:\ocr\blank.png"
# Upper and lower bounds for the font size used when drawing translations.
DEFUALT_FONT_SIZE = 60
MIN_FONT_SIZE = 20
# After detection, shrink each box's height slightly so that individual
# text blocks can be found.
height_sub = 0.1
# Relative tolerances used when testing whether a text box belongs to a
# merged rectangle.
include_height_sub = 0.3
include_width_sub = 0.3
import os

# Text file recording, one path per line, the page images already translated.
finished_list = "finished_list.txt"

# Loaded models are expensive to build, so they are created once and reused.
_OCR_READERS = {}
_TRANSLATORS = {}


def generate_mask(png, graph_infos):
    """Write a black image with a white box for every detected text region.

    :param png: path of the page image; only its size is used.
    :param graph_infos: OCR results; ``info[0]`` holds the four corner points
        in the order top-left, top-right, bottom-right, bottom-left.
    :return: path of the saved mask (``"<png>_filled.png"``).
    """
    image = Image.open(png)
    im_width, im_height = image.size
    fill_image = Image.new('RGBA', (im_width, im_height), (0, 0, 0))
    image.paste(fill_image, (0, 0))  # blacken the whole page
    for info in graph_infos:
        pos_info = info[0]
        left_up_point = pos_info[0]  # e.g. [939, 791]
        right_up_point = pos_info[1]  # e.g. [1007, 791]
        right_down_point = pos_info[2]  # e.g. [1007, 805]
        box_x = left_up_point[0]
        box_y = left_up_point[1]
        width = right_up_point[0] - left_up_point[0]
        height = right_down_point[1] - right_up_point[1]
        height_sub_num = height * height_sub
        box_w = int(width)
        box_h = int(height - height_sub_num)
        if box_w <= 0 or box_h <= 0:
            # Degenerate (e.g. rotated) detections cannot be drawn.
            continue
        blank_png = Image.new('RGBA', (box_w, box_h), (255, 255, 255))
        # NOTE(review): the shrink amount is added to the x coordinate, not
        # to y. This is kept as in the original because the downstream
        # merging was tuned against it -- confirm whether y was intended.
        image.paste(blank_png, (int(box_x + height_sub_num), int(box_y)))
    mask_path = "{}_filled.png".format(png)
    image.save(mask_path)
    return mask_path


def merge_neighbers(png, graph_infos):
    """Merge neighbouring text lines into paragraph rectangles.

    :param png: path of the page image.
    :param graph_infos: OCR results (box, ..., text, confidence).
    :return: rectangles from ``rect_dealer.getHoleRects`` with the text and
        confidence of each OCR result attached by ``detect_include``.
    """
    filled_path = generate_mask(png, graph_infos)  # build the mask image
    rects = rect_dealer.getHoleRects(filled_path)  # find merged regions
    for info in graph_infos:
        detect_include(rects, info[0], info[-2], info[-1])
    return rects


def detect_include(rects, pos_info, words, acc):
    """Attach one OCR text box to the first rectangle that contains it.

    :param rects: candidate rectangles; the matching one is mutated
        (``words``, ``acc`` and ``line_num`` are updated).
    :param pos_info: four corner points of the OCR box.
    :param words: recognised text of the box.
    :param acc: recognition confidence of the box.
    """
    left_up_point = pos_info[0]
    right_up_point = pos_info[1]
    right_down_point = pos_info[2]
    width = right_up_point[0] - left_up_point[0]
    height = right_down_point[1] - right_up_point[1]
    # Tolerances are relative to the box size but capped in pixels.
    height_sub_num = min(height * include_height_sub, 20)
    width_sub_num = min(include_width_sub * width, 10)
    for rect in rects:
        big_enough = (width - width_sub_num < rect.w
                      and height - height_sub_num < rect.h)
        if not big_enough:
            continue
        print("minus:{},{}".format(rect, pos_info))
        starts_inside = (
            rect.x - width_sub_num < left_up_point[0] < rect.x + rect.w
            and rect.y - height_sub_num < left_up_point[1] < rect.y + rect.h
        )
        # The right/bottom bound keeps a box from being attached to an
        # unrelated rectangle that merely lies up-left of it.
        if starts_inside:
            rect.words += words
            # NOTE(review): this halves the running value on every line, so
            # it is a recency-weighted score rather than a true mean. Kept
            # because pass_point is tuned against it.
            rect.acc += float(acc)
            rect.acc /= 2.0
            rect.line_num += 1
            return


def change_graph2words(graph_path, languages):
    """Run OCR on an image.

    :param graph_path: path of the image.
    :param languages: language codes handed to ``easyocr.Reader``.
    :return: the result of ``Reader.readtext``.
    """
    key = tuple(languages)
    reader = _OCR_READERS.get(key)
    if reader is None:
        # Building a Reader loads the models; do it once per language set.
        reader = easyocr.Reader(languages, model_storage_directory=cut_model_path,
                                download_enabled=False, gpu=True)
        _OCR_READERS[key] = reader
    return reader.readtext(graph_path)


def words2chinese(words, from_lang, tgt_lang):
    """Translate text with the model stored at ``trans_model_path``.

    :param words: text (or iterable of strings) to translate; it is joined
        without a separator.
    :param from_lang: source language code.
    :param tgt_lang: target language code.
    :return: the raw output of the translation pipeline.
    """
    translator = _TRANSLATORS.get(trans_model_path)
    if translator is None:
        from transformers import pipeline
        # Loading the model is slow; reuse it for every later call.
        translator = pipeline("translation", model=trans_model_path)
        _TRANSLATORS[trans_model_path] = translator
    to_trans = "".join(words)
    output = translator(to_trans, src_lang=from_lang, tgt_lang=tgt_lang)
    print("翻译原文:{}\n翻译结果:{}".format(to_trans, output))
    return output


def pdf2png(pdf_name, skip=None):
    """Render every page of a PDF to ``<pdf2png_save_path>/<page index>.png``.

    :param pdf_name: path of the PDF.
    :param skip: optional collection of page-image paths that must not be
        re-rendered when the file already exists (used to keep pages that
        were translated by an interrupted run).
    :return: list of page-image paths in page order.
    """
    import fitz
    keep = set(skip or ())
    os.makedirs(pdf2png_save_path, exist_ok=True)
    # A zoom of 1.0 renders at the page's native resolution; raising both
    # factors to 2 would give four times as many pixels.
    zoom_x = 1.0
    zoom_y = 1.0
    rotate = 0
    png_paths = []
    with fitz.open('{}'.format(pdf_name)) as doc:
        trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
        for pg in range(doc.page_count):
            graph_path = os.path.join(pdf2png_save_path, '%s.png' % pg)
            already_done = graph_path in keep and os.path.exists(graph_path)
            if not already_done:
                pm = doc[pg].get_pixmap(matrix=trans, alpha=False)
                pm.save(graph_path, output="png")
            png_paths.append(graph_path)
    return png_paths


def line_sep(sentense, line_num):
    """Split a sentence into at most ``line_num`` lines of equal length.

    :param sentense: text to split.
    :param line_num: wanted number of lines; values below 1 leave the text
        unchanged.
    :return: the pieces (each stripped) joined by newlines.
    """
    if line_num <= 0 or not sentense:
        return sentense
    chunk = -(-len(sentense) // line_num)  # ceiling division
    pieces = [sentense[start:start + chunk].strip()
              for start in range(0, len(sentense), chunk)]
    return "\n".join(pieces)


def draw_text(png, infos):
    """Blank out each rectangle on the image and draw its text inside.

    :param png: path of the image; it is overwritten with the result.
    :param infos: rectangles providing ``x``, ``y``, ``w``, ``h``, ``words``
        and ``line_num``.
    """
    image = Image.open(png)
    for info in infos:
        blank_png = Image.new('RGBA', (info.w, info.h), (255, 255, 255))
        image.paste(blank_png, (info.x, info.y))
        # Scale the font to the rectangle and its line count, clamped to
        # the configured range; guard against a zero line count.
        lines = max(info.line_num, 1)
        font_size = int(min(info.w, info.h) / (1.5 * lines))
        font_size = max(min(font_size, DEFUALT_FONT_SIZE), MIN_FONT_SIZE)
        n = ImgText(info.words, font_size, info.w)
        n.draw_text(image, info.x, info.y)
    image.save("{}".format(png))


def clear_png_files():
    """Placeholder for clearing the page-image folder; currently a no-op."""
    pass


def translate_a_pdf(pdf_path, detectlang: list, translate_from_lang: str, translate_to_lang: str):
    """Translate the text on every page image of a PDF in place.

    Pages listed in ``finished_list`` are skipped, so an interrupted run can
    be resumed.

    :param pdf_path: path of the PDF to translate.
    :param detectlang: language codes for OCR.
    :param translate_from_lang: source language code for translation.
    :param translate_to_lang: target language code for translation.
    """
    try:
        with open(finished_list, "r", encoding="utf-8") as f:
            finished = {line.strip() for line in f}
    except FileNotFoundError:
        # First run: nothing has been translated yet.
        finished = set()
    clear_png_files()
    # Finished pages are not re-rendered, which would discard their
    # translated image.
    png_paths = pdf2png(pdf_path, skip=finished)
    for png in png_paths:
        if png in finished:
            continue
        img_changes = []
        words_result = change_graph2words(png, detectlang)
        print("查找到的文本:{}".format(words_result))
        rects = merge_neighbers(png, words_result)  # merge lines into paragraphs
        for rect in rects:
            if float(rect.acc) < pass_point:
                print("认为这个词正确度{}极低,不进行翻译:{}".format(rect.acc, rect.words))
                continue
            transed_words = words2chinese(rect.words, translate_from_lang, translate_to_lang)
            rect.words = "".join(trans["translation_text"] for trans in transed_words)
            print("存储位置:{}".format(str(rect)))
            img_changes.append(rect)
        draw_text(png, img_changes)
        print("输出图片:{}".format(png))
        with open(finished_list, "a+", encoding="utf-8") as f:
            f.write(png + "\n")


def _page_sort_key(path):
    """Order page images numerically by file stem (2.png before 10.png)."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdigit():
        return (0, int(stem), path)
    return (1, 0, path)


def combine_imgs_pdf(folder_path, pdf_file_path):
    """Combine the page images listed in ``finished_list`` into one PDF.

    ``finished_list`` is emptied once the PDF has been written.

    Args:
        folder_path (str): source folder (unused; the pages are taken from
            ``finished_list``).
        pdf_file_path (str): output path of the PDF.

    Raises:
        ValueError: if ``finished_list`` contains no pages.
    """
    # Imported locally, like the other heavy dependencies in this script.
    from PIL import Image
    with open(finished_list, "r", encoding="utf-8") as f:
        png_list = [line.strip() for line in f if line.strip()]
    if not png_list:
        raise ValueError("no translated pages recorded in {}".format(finished_list))
    png_list.sort(key=_page_sort_key)
    pages = []
    try:
        for file in png_list:
            png_file = Image.open(file)
            if png_file.mode != "RGB":
                # Normalise every page so PDF export accepts it.
                png_file = png_file.convert("RGB")
            pages.append(png_file)
        pages[0].save(pdf_file_path, "pdf", save_all=True, append_images=pages[1:])
    finally:
        for page in pages:
            page.close()
    with open(finished_list, "w", encoding="utf-8") as f:
        f.write("")


if __name__ == '__main__':
    from_lang = ["ja", "en"]
    to_lang = ["zh"]
    pdf_name = "ポーズの定理_ダイジェスト.pdf"
    translate_a_pdf(os.path.join(pdf_path, pdf_name), from_lang, "ja", "zh")
    combine_imgs_pdf(pdf2png_save_path, os.path.join(pdf_path, "changed_"+pdf_name))
处理一下一些段落,按照段落去识别
rect_dealer.py
import math

import cv2
from PIL import Image
from PIL import Image, ImageDraw, ImageFont

# The class that follows defines a rectangular boundary.
class Rec:
    """Rectangle (x, y, w, h) plus the text gathered for it."""

    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        self.words = ""  # text collected for this rectangle
        self.acc = 0  # accumulated recognition confidence
        self.line_num = 0  # number of text lines attached so far

    def __str__(self):
        return "x:" + str(self.x) + " y:" + str(self.y) + " w:" + str(self.w) + " h:" + str(
            self.h) + " words: " + str(self.words)

    def __repr__(self):
        return str(self)


def include_other_recs(rec_in: Rec, recs):
    """Tell whether ``rec_in`` contains any other rectangle of ``recs``.

    :param rec_in: rectangle to test.
    :param recs: rectangles to compare against; ``rec_in`` itself is ignored.
    :return: True if some other rectangle lies inside ``rec_in`` (with a
        5-unit margin required below it), otherwise False.
    """
    for rec in recs:
        if rec is rec_in:
            continue
        spans_x = rec_in.x <= rec.x and rec_in.x + rec_in.w >= rec.x + rec.w
        spans_y = rec_in.y <= rec.y and rec_in.y + rec_in.h >= rec.y + rec.h + 5
        if spans_x and spans_y:
            return True
    return False


def hole_select(recs):
    """Keep only the rectangles that do not contain another rectangle."""
    return [rec for rec in recs if not include_other_recs(rec, recs)]


class detectWords(object):
    """Dilate an image so that nearby regions grow together."""

    def __init__(self, src_img, width_max_scale=15, height_max_scale=15):
        self.src_img = src_img
        self.width_scale = width_max_scale
        self.height_scale = height_max_scale

    def run(self):
        """Return the dilated grayscale version of ``src_img``.

        :raises ValueError: if the image is neither 2-D nor 3-D.
        """
        ndim = len(self.src_img.shape)
        if ndim == 2:  # already grayscale
            gray_img = self.src_img
        elif ndim == 3:
            gray_img = cv2.cvtColor(self.src_img, cv2.COLOR_BGR2GRAY)
        else:
            raise ValueError("expected a 2-D or 3-D image, got {} dimensions".format(ndim))
        # NOTE(review): OpenCV documents ksize as (width, height); the
        # arguments here are (height, width). Harmless while both scales
        # are equal -- confirm before using different values.
        dilated_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (self.height_scale, self.width_scale))
        # NOTE(review): the third positional argument of cv2.dilate is
        # documented as dst, not iterations, so the 10 presumably does not
        # request ten passes. Left unchanged to keep the tuned output.
        dilated = cv2.dilate(gray_img.copy(), dilated_kernel, 10)
        return dilated


# Decide whether regions are tables; returns several rectangles that may
# contain tables (they may overlap or contain one another):
def region_hole(image):
    """Collect the bounding rectangles of the contours found in ``image``.

    A debug image with the contours drawn is written to ``region_table.png``.

    :param image: single-channel image handed to ``cv2.findContours``.
    :return: list of ``Rec``; contours with an area below 150 are dropped.
    """
    recs = []
    # Taking the last two items works whether findContours returns
    # (contours, hierarchy) or (image, contours, hierarchy).
    contours_mask, _ = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2:]
    draw_img_in = cv2.drawContours(image.copy(), contours_mask, -1, (153, 153, 0), 2, maxLevel=2)
    cv2.imwrite("region_table.png", draw_img_in)
    for contour in contours_mask:
        # Small areas are treated as noise rather than real regions.
        if cv2.contourArea(contour) < 150:
            continue
        approx = cv2.approxPolyDP(contour, 3, True)  # simplify the outline
        x, y, width, height = cv2.boundingRect(approx)
        recs.append(Rec(x, y, width, height))
    return recs


def draw_rects(png, recs):
    """Save ``"<png>_filled.png"``: a black image with each rectangle white.

    :param png: path of an image whose size is used for the canvas.
    :param recs: rectangles providing ``x``, ``y``, ``w`` and ``h``.
    """
    image = Image.open(png)
    im_width, im_height = image.size
    fill_image = Image.new('RGBA', (im_width, im_height), (0, 0, 0))
    image.paste(fill_image, (0, 0))  # blacken everything
    for info in recs:
        blank_png = Image.new('RGBA', (info.w, info.h), (255, 255, 255))
        image.paste(blank_png, (info.x, info.y))
    image.save("{}_filled.png".format(png))


def getHoleRects(png_path):
    """Find the innermost rectangles in a mask image.

    :param png_path: path of the mask image.
    :return: rectangles that do not contain another rectangle.
    :raises FileNotFoundError: if the image cannot be read.
    """
    origin_image = cv2.imread(png_path)
    if origin_image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("cannot read image: {}".format(png_path))
    h_dilated_img = detectWords(origin_image).run()  # dilate slightly
    recs = region_hole(h_dilated_img)  # candidate regions
    results = hole_select(recs)  # keep the innermost ones
    draw_rects('region_table.png', results)
    return results


if __name__ == '__main__':
    file_name = r'F:\ocr\pdf2png\1.png_filled.png'
    getHoleRects(file_name)
img_text.py (这段代码抄的网上、实现了图片文本换行的效果)
from PIL import Image, ImageDraw, ImageFont


class ImgText:
    """Wrap text to a pixel width and draw it onto an image."""

    def __init__(self, text, font_size, width, font_path=r'C:\Windows\Fonts\simhei.ttf'):
        """
        :param text: text to draw; explicit newlines separate paragraphs.
        :param font_size: font size handed to ``ImageFont.truetype``.
        :param width: maximum line width in pixels.
        :param font_path: TrueType font file to use.
        """
        self.font = ImageFont.truetype(font_path, font_size)
        # Target width; change it to the width of the image area you need.
        self.width = width
        self.text = text
        # Wrapped paragraphs, total text height, height of one line.
        self.duanluo, self.note_height, self.line_height = self.split_text()

    def _char_size(self, draw, char):
        """Return (width, height) of one character in the current font."""
        if hasattr(draw, "textsize"):
            return draw.textsize(char, self.font)
        # textsize no longer exists in recent Pillow releases.
        # NOTE(review): the right/bottom edge of the box anchored at (0, 0)
        # is presumed to match what textsize reported -- confirm.
        _, _, right, bottom = draw.textbbox((0, 0), char, font=self.font)
        return right, bottom

    def get_duanluo(self, text):
        """Wrap one paragraph to ``self.width``.

        :return: (wrapped text ending in a newline, line height, line count)
        """
        txt = Image.new('RGBA', (100, 100), (255, 255, 255, 0))
        draw = ImageDraw.Draw(txt)
        pieces = []
        line_width = 0
        line_count = 1
        line_height = 0
        for char in text:
            width, height = self._char_size(draw, char)
            # Break before a character that would overflow the line, but
            # never before the first character of a line.
            if line_width > 0 and line_width + width > self.width:
                line_count += 1
                line_width = 0
                pieces.append('\n')
            pieces.append(char)
            # The character that opens a new line counts towards its width.
            line_width += width
            line_height = max(height, line_height)
        duanluo = "".join(pieces)
        if not duanluo.endswith('\n'):
            duanluo += '\n'
        return duanluo, line_height, line_count

    def split_text(self):
        """Wrap every paragraph of ``self.text``.

        :return: (list of (wrapped paragraph, line count), total height,
            line height)
        """
        max_line_height, total_lines = 0, 0
        allText = []
        for text in self.text.split('\n'):
            duanluo, line_height, line_count = self.get_duanluo(text)
            max_line_height = max(line_height, max_line_height)
            total_lines += line_count
            allText.append((duanluo, line_count))
        line_height = max_line_height
        total_height = total_lines * line_height
        return allText, total_height, line_height

    def draw_text(self, note_img, x, y):
        """Draw the wrapped text in red, starting at the top-left (x, y).

        The image is also saved to ``result.png``.
        """
        draw = ImageDraw.Draw(note_img)
        for duanluo, line_count in self.duanluo:
            draw.text((x, y), duanluo, fill=(255, 0, 0), font=self.font)
            y += self.line_height * line_count
        note_img.save("result.png")
步骤:
1.先用easyocr识别文本,easyocr需要下载easyocr的模型,放在cut_model文件夹里
下载地址:https://www.jaided.ai/easyocr/modelhub/ (可能需要科学上网)
2.在这里可以控制easyocr识别的文本语言:
我这里输入ja、en,代表日语(japanese)和英语(english),所以会从图片中检测出日语和英语的文本
3.简单地处理一下块,把一个段落的文本,合并起来
4.输入到翻译模型中,这里可以是任何模型,我试过下面几个模型
绿色框住的是好用的,其他的由于各种原因(比如太慢、性能太差)被我残忍抛弃。
(ps:opus-mt-XX的模型是真的好用,又小又准确,但是它!没有ja-zh,所以……好气!)
例如:m2m100_418M,这个模型在:https://toscode.gitee.com/mirrors_UKPLab/EasyNMT 可以看到,
它的节点和大小没有m2m100_1.2B多,我下载了试了试,真的不能用
这俩的翻译对比:左边m2m100_418M,右边m2m100_1.2B
性能差了很多,而且会出现奇怪的表现,速度也没有快多少。
模型排行榜:
(排行靠前的一大堆,没一个开源的,我只能说,感谢meta,小扎还是良心企业嗷)
网易有道词典小语种翻译实现思路
网易有道的小语种翻译真的很牛,微信在它面前被揍得像个弟弟,可惜模型都不公开,毕竟都是核心资源……
其他语种模型可以去下面的笑脸中心找,很牛的企业,可能需要科学上网,模型太大的话可以用迅雷下载器(或者用别的下载器),
下载器下载能快许多:
也可以用讯飞的api直接就翻译日语了
https://www.xfyun.cn/services/xftrans
给的200万字免费调用,够用一段时间了
m2m100_1.2B模型翻译日文还是有很多不尽如人意的地方,
例如:
1.速度很慢:慢得我有点受不了了
2.正确率还不够好(虽然也不太差了):
—————————————————————————————
后来换了讯飞的接口试了下,也不怎么样(调用接口还很麻烦)
讯飞翻译:
唯一好使的只有有道图片翻译,感觉错误率明显低;而且提供了任意体验的服务,真的很好,如果不是想一键pdf2pdf,那么用有道去翻译一下也可以。