非OCR方案,基于java:
aspose 21.11版本(网上有破解方法,或者参考我另外一篇文章)
转换pdf(含表格)为excel文件,然后可以使用poi对得到的excel文件进行微调。
但是上述方案无法处理 PDF 表格中存在较多横向、纵向合并单元格的情况,例如下图这种 PDF 中的复杂表格:
后来在 GitHub 上找到一位大牛的方法:使用 Python 识别表格的边框和单元格,然后重新构造出原始的表格内容,包括合并单元格的信息。(重新构造出来的表格样式,尤其是列宽和行高,并不能与原表格完全保持一致;这里重点关注的是单元格的结构和单元格中的数据。)
识别算法会先恢复出原始(合并单元格之前)的网格单元格,识别结果的示意图如下:
最后再构造出合并单元格。这里的例子输出的是图片;若要输出为 Excel 或其他表格格式的数据,还需要另做处理,这里未给出具体实现代码。不过,仅凭它能够完整还原出原来复杂表格的单元格和数据,就已经非常厉害了。
python实现的,单元格合并识别算法参考,输出图片格式
Handling merged cells (possible solution) · Issue #84 · jsvine/pdfplumber · GitHub
识别算法图示:
使用如下字符来标记每个交点(即单元格的角点)处,边框线向上、下、左、右哪些方向延伸(也就是沿哪个方向可以到达相邻的交点/单元格)。代码用这种数据结构来定义和存储表格的结构。
上述定义的,不同数据的图示
主要代码:
https://github.com/shuratn/py_pdf_stm/blob/master/TableExtractor.py
"""Rebuild the cell structure of PDF tables (including merged cells).

pdfplumber's ``TableFinder`` supplies the bounding boxes of the visible table
cells.  From those boxes this module derives the border lines, intersects
every vertical line with every horizontal line to obtain the full grid of
"skeleton" points, and builds the un-merged skeleton cells from that grid.
``Table`` then maps every skeleton cell onto the (possibly merged) real cell
that contains its centre.
"""
import math
import sys
from operator import itemgetter
from typing import List

import pdfplumber
from PIL import ImageDraw, ImageFont, Image
from pdfplumber.table import TableFinder
from tqdm import tqdm

from DataSheetParsers.DataSheet import *


def almost_equals(num1, num2, precision=5.0):
    """Return True when ``num1`` and ``num2`` differ by less than ``precision``."""
    return abs(num1 - num2) < precision


class Point:
    """A grid point with flags telling in which directions a border leaves it.

    Coordinates are rounded up to integers on construction.  ``up``, ``down``,
    ``left`` and ``right`` start as False and are set by ``Line`` or by callers.
    """

    r = 4        # marker diameter used by draw()
    hr = r / 2   # marker radius
    tail = 5     # length of the direction ticks drawn by draw()

    # Keys are (up, down, right, left).
    _DIRECTION_SYMBOLS = {
        (False, False, False, False): '◦',
        (True, False, False, False): '↑',
        (False, True, False, False): '↓',
        (True, True, False, False): '↕',
        (True, True, True, False): '⊢',
        (True, True, False, True): '⊣',
        (False, False, True, False): '→',
        (False, False, False, True): '←',
        (False, False, True, True): '↔',
        (True, False, True, True): '⊥',
        (False, True, True, True): '⊤',
        (True, True, True, True): '╋',
        (True, False, True, False): '┗',
        (True, False, False, True): '┛',
        (False, True, True, False): '┏',
        # Down + left is the top-right corner; the original mapped it to '┛',
        # which duplicated the up + left entry.
        (False, True, False, True): '┓',
    }

    def __init__(self, *xy):
        """Accept either ``Point(x, y)`` or ``Point((x, y))``."""
        if len(xy) == 1:
            xy = xy[0]
        self.x, self.y = xy
        self.x = math.ceil(self.x)
        self.y = math.ceil(self.y)
        self.down = False
        self.up = False
        self.left = False
        self.right = False

    @property
    def symbol(self):
        """Single character picturing the directions set on this point."""
        return self._DIRECTION_SYMBOLS[(self.up, self.down, self.right, self.left)]

    def __repr__(self):
        return "Point<X:{} Y:{}>".format(self.x, self.y)

    def distance(self, other: 'Point'):
        """Euclidean distance to ``other``."""
        return math.sqrt(((self.x - other.x) ** 2) + ((self.y - other.y) ** 2))

    @property
    def as_tuple(self):
        """The point as an ``(x, y)`` tuple."""
        return self.x, self.y

    def draw(self, canvas: ImageDraw.ImageDraw, color='red'):
        """Draw the point as a dot plus a short blue tick per set direction."""
        canvas.ellipse((self.x - self.hr, self.y - self.hr, self.x + self.hr, self.y + self.hr), fill=color)
        if self.down:
            canvas.line(((self.x, self.y), (self.x, self.y + self.tail)), 'blue')
        if self.up:
            canvas.line(((self.x, self.y), (self.x, self.y - self.tail)), 'blue')
        if self.left:
            canvas.line(((self.x, self.y), (self.x - self.tail, self.y)), 'blue')
        if self.right:
            canvas.line(((self.x, self.y), (self.x + self.tail, self.y)), 'blue')

    def points_to_right(self, other_points: List['Point']):
        """Points on (almost) the same y with a larger x, sorted by x."""
        sorted_other_points = sorted(other_points, key=lambda other: other.x)
        return [o for o in sorted_other_points
                if almost_equals(o.y, self.y) and o != self and o.x > self.x]

    def points_below(self, other_points: List['Point']):
        """Points on (almost) the same x with a larger y, sorted by y."""
        sorted_other_points = sorted(other_points, key=lambda other: other.y)
        return [o for o in sorted_other_points
                if almost_equals(o.x, self.x) and o != self and o.y > self.y]

    def on_same_line(self, other: 'Point'):
        """True when ``other`` is a different point sharing (almost) the same x or y."""
        if self == other:
            return False
        return almost_equals(self.x, other.x) or almost_equals(self.y, other.y)

    def is_above(self, other: 'Point'):
        return self.y < other.y

    def is_to_right(self, other: 'Point'):
        return self.x > other.x

    def is_below(self, other: 'Point'):
        return self.y > other.y

    def is_to_left(self, other: 'Point'):
        return self.x < other.x

    def get_right(self, others: List['Point']):
        """Nearest point to the right that has ``down`` set, or None."""
        for point in self.points_to_right(others):
            if point.down:
                return point
        return None

    def get_bottom(self, others: List['Point'], left=False, right=False):
        """Nearest point below that has ``up`` set, or None.

        With ``left=True`` the candidate must also have ``right`` set; with
        ``right=True`` it must also have ``left`` set.
        """
        for point in self.points_below(others):
            if not point.up:
                continue
            if left and not point.right:
                continue
            if right and not point.left:
                continue
            return point
        return None

    def has_above(self, others: List['Point']):
        """True when the topmost point with ``up`` set lies above this point.

        Returns False when no point in ``others`` has ``up`` set (the original
        raised IndexError in that case).
        """
        candidates = [p for p in others if p.up]
        if not candidates:
            return False
        topmost = min(candidates, key=lambda p: p.y)
        return topmost.is_above(self)

    def copy(self, other: 'Point'):
        """Overwrite this point's direction flags with those of ``other``."""
        self.down = other.down
        self.up = other.up
        self.left = other.left
        self.right = other.right

    def merge(self, other: 'Point'):
        """OR the direction flags of ``other`` into this point."""
        self.up |= other.up
        self.down |= other.down
        self.left |= other.left
        self.right |= other.right

    def __eq__(self, other: 'Point'):
        # Equality is tolerant (see almost_equals) while __hash__ is exact, so
        # two "equal" points may still hash differently.
        if not isinstance(other, Point):
            return NotImplemented
        if not almost_equals(self.x, other.x):
            return False
        return almost_equals(self.y, other.y)

    def __hash__(self):
        return hash((self.x, self.y))


class Line:
    """A border segment between two points, classified vertical or horizontal.

    The end points are ordered so that ``p1`` is the top end of a vertical
    line or the left end of a horizontal one.  Construction also sets the
    matching direction flags on both end points.
    """

    def __init__(self, p1: 'Point', p2: 'Point'):
        self.p1 = p1
        self.p2 = p2
        self.vertical = almost_equals(self.x, self.cx)
        if self.vertical:
            if not self.p1.is_above(self.p2):
                self.p1, self.p2 = self.p2, self.p1
            self.p1.down = True
            self.p2.up = True
        else:
            if not self.p2.is_to_right(self.p1):
                self.p1, self.p2 = self.p2, self.p1
            self.p1.right = True
            self.p2.left = True

    def __hash__(self):
        return hash((self.p1, self.p2, self.vertical))

    @property
    def x(self):
        return self.p1.x

    @property
    def y(self):
        return self.p1.y

    @property
    def cx(self):
        return self.p2.x

    @property
    def cy(self):
        return self.p2.y

    @property
    def length(self):
        return self.p1.distance(self.p2)

    def __repr__(self):
        return 'Line<p1:{} p2:{} {}>'.format(self.p1, self.p2, 'vertical' if self.vertical else 'horizontal')

    def draw(self, canvas: ImageDraw.ImageDraw, color='blue'):
        """Draw the segment on ``canvas``."""
        canvas.line(((self.x, self.y), (self.cx, self.cy)), color, width=2)

    @property
    def as_tuple(self):
        """The segment as ``((x1, y1), (x2, y2))``."""
        return (self.x, self.y), (self.cx, self.cy)

    def infite_intersect(self, other: 'Line'):
        """Intersect the infinite lines through both segments.

        Returns ``(x, y)``, or ``(None, None)`` when the lines are parallel.
        Note that ``(None, None)`` is a truthy tuple, so callers must test the
        components against None.
        """
        line1 = self.as_tuple
        line2 = other.as_tuple
        x_diff = (line1[0][0] - line1[1][0], line2[0][0] - line2[1][0])
        y_diff = (line1[0][1] - line1[1][1], line2[0][1] - line2[1][1])

        def det(point_a, point_b):
            return point_a[0] * point_b[1] - point_a[1] * point_b[0]

        div = det(x_diff, y_diff)
        if div == 0:
            return None, None
        d = (det(*line1), det(*line2))
        x = det(d, x_diff) / div
        y = det(d, y_diff) / div
        return x, y

    def intersect(self, other: 'Line', print_fulness=False) -> bool:
        """Return True when this segment and ``other`` intersect.

        ``r`` and ``s`` are the scalar positions of the intersection along
        this segment and along ``other``.  Values slightly below 0 (down to
        -0.1) are accepted to tolerate inaccuracy at the edges.  With
        ``print_fulness`` both scalars are printed.
        """
        # Cheap vertical-range rejection kept from the original code.
        if self.vertical:
            if self.y > self.cy:
                if not (self.y >= other.y >= self.cy):
                    return False
        else:
            if other.y > other.cy:
                if not (other.y >= self.y >= other.cy):
                    return False
        det_tolerance = 0.0001
        x1, y1 = self.x, self.y
        x2, y2 = self.cx, self.cy
        dx1 = x2 - x1
        dy1 = y2 - y1
        x, y = other.x, other.y
        xb, yb = other.cx, other.cy
        dx = xb - x
        dy = yb - y
        det = (-dx1 * dy + dy1 * dx)
        if math.fabs(det) < det_tolerance:
            # Parallel (or nearly parallel) segments.
            return False
        det_inv = 1.0 / det
        r = det_inv * (-dy * (x - x1) + dx * (y - y1))
        s = det_inv * (-dy1 * (x - x1) + dx1 * (y - y1))
        if print_fulness:
            print('self segment', r)
            print('other segment', s)
        if r > 1 or s > 1:  # beyond the end of one of the segments
            return False
        if r > -0.1 and s > -0.1:  # This can happen on edges, so we allow small inaccuracy
            return True
        return False

    def intersection(self, other: 'Line', print_fulness=False):
        """Compute the intersection of this line with ``other``.

        Returns ``((xi, yi), r, s)`` where ``(xi, yi)`` is the rounded
        intersection and ``r`` / ``s`` are the scalar positions along this
        line and along ``other``, rounded to 4 digits.  Returns the 2-tuple
        ``(None, None)`` when the determinant is below the tolerance.
        """
        det_tolerance = 1
        x1, y1 = self.x, self.y
        x2, y2 = self.cx, self.cy
        dx1 = x2 - x1
        dy1 = y2 - y1
        x, y = other.x, other.y
        xb, yb = other.cx, other.cy
        dx = xb - x
        dy = yb - y
        det = (-dx1 * dy + dy1 * dx)
        if math.fabs(det) < det_tolerance:
            return None, None
        det_inv = 1.0 / det
        r = det_inv * (-dy * (x - x1) + dx * (y - y1))
        s = det_inv * (-dy1 * (x - x1) + dx1 * (y - y1))
        # Average the two parametric evaluations of the same point.
        xi = (x1 + r * dx1 + x + s * dx) / 2.0
        yi = (y1 + r * dy1 + y + s * dy) / 2.0
        if print_fulness:
            print('self segment', r)
            print('other segment', s)
        return (round(xi), round(yi)), round(r, 4), round(s, 4)

    def is_between(self, point: 'Point'):
        """True when ``point`` lies on the segment between ``p1`` and ``p2``."""
        pt1 = self.p1
        pt2 = self.p2
        cross_product = (point.y - pt1.y) * (pt2.x - pt1.x) - (point.x - pt1.x) * (pt2.y - pt1.y)
        # NOTE(review): math.e (~2.718) is used as the collinearity tolerance;
        # presumably a small epsilon was intended - confirm before changing.
        if abs(cross_product) > math.e:
            return False
        dot_product = (point.x - pt1.x) * (pt2.x - pt1.x) + (point.y - pt1.y) * (pt2.y - pt1.y)
        if dot_product < 0:
            return False
        squared_length_ba = (pt2.x - pt1.x) * (pt2.x - pt1.x) + (pt2.y - pt1.y) * (pt2.y - pt1.y)
        if dot_product > squared_length_ba:
            return False
        return True

    def on_line(self, point: 'Point'):
        """True when ``point`` shares this line's x (vertical) or y (horizontal)."""
        if self.vertical:
            return almost_equals(self.p1.x, point.x)
        return almost_equals(self.p1.y, point.y)

    def __contains__(self, other):
        """``line in self``: perpendicular and intersecting; ``point in self``: on the segment."""
        if isinstance(other, Line):
            if self.vertical == other.vertical:
                return False
            return self.intersect(other)
        if isinstance(other, Point):
            return self.is_between(other)
        return False

    def on_same_line(self, other: 'Line'):
        """True when both lines have the same orientation and the same x (or y)."""
        if other.vertical != self.vertical:
            return False
        if self.vertical:
            return self.x == other.x
        return self.y == other.y

    def __eq__(self, other: 'Line'):
        # Lines compare equal when they are collinear (see on_same_line),
        # which is what filter_lines relies on; __hash__ is stricter.
        if not isinstance(other, Line):
            return NotImplemented
        return self.on_same_line(other)

    def corner(self, other: 'Line'):
        """True when the two lines share an end point.

        The original omitted the ``p2 == other.p1`` combination.
        """
        return (self.p1 == other.p1 or self.p2 == other.p2
                or self.p1 == other.p2 or self.p2 == other.p1)

    def connected(self, other: 'Line'):
        """True when either end point of ``other`` lies on this segment."""
        return other.p1 in self or other.p2 in self

    def parallel(self, other: 'Line'):
        return self.vertical == other.vertical

    def on_corners(self, other: 'Point'):
        """True when ``other`` equals one of this line's end points."""
        return other == self.p1 or other == self.p2

    def test_intersection(self, other: 'Line'):
        """Print a test for checking an intersection by hand."""
        print('Testing intersection of:')
        print('\t', self)
        print('\t', other)
        result = self.intersection(other, True)
        # intersection() yields (None, None) when there is no usable result.
        found = Point(result[0]) if result[0] is not None else None
        print("\t Intersection result =", found)
        print()


class Cell:
    """A rectangular cell given by its four corner points.

    P1-------P2
    |        |
    |        |
    |        |
    |        |
    P4-------P3
    """

    try:
        font = ImageFont.truetype('arial', size=9)
    except OSError:
        # Font file not available: fall back to PIL's built-in font.
        font = ImageFont.load_default()

    def __init__(self, p1, p2, p3, p4):
        self.p1: Point = p1
        self.p2: Point = p2
        self.p3: Point = p3
        self.p4: Point = p4
        self.text = ''
        self.words = []  # word items; get_text() reads their 'text' key

    def __repr__(self):
        return 'Cell <"{}"> '.format(self.text.replace('\n', ' '))

    def get_text(self):
        """Concatenate the 'text' entry of every item in ``words``."""
        return ''.join(map(itemgetter('text'), self.words))

    @property
    def clean_text(self) -> str:
        """``text`` with newlines replaced by spaces."""
        return self.text.replace('\n', ' ')

    def __hash__(self):
        # The hash depends on the mutable ``text`` attribute.
        return hash(self.text) + hash(self.as_tuple)

    def on_same_line(self, other: 'Cell'):
        return self.p1.on_same_line(other.p1)

    def on_same_row(self, other: 'Cell'):
        """True when both cells have exactly the same top-left y."""
        return self.p1.y == other.p1.y

    @property
    def as_tuple(self):
        return self.p1.as_tuple, self.p2.as_tuple, self.p3.as_tuple, self.p4.as_tuple

    def __eq__(self, other: 'Cell'):
        """Cells are equal when their corners match under any rotation."""
        if not isinstance(other, Cell):
            return NotImplemented
        mine = (self.p1, self.p2, self.p3, self.p4)
        theirs = (other.p1, other.p2, other.p3, other.p4)
        # Explicit False instead of the original's implicit None.
        return any(
            all(mine[i] == theirs[(i + shift) % 4] for i in range(4))
            for shift in range(4)
        )

    @property
    def center(self):
        """Centroid of the four corners as a ``Point``."""
        corners = [self.p1, self.p2, self.p3, self.p4]
        return Point(sum(p.x for p in corners) / 4, sum(p.y for p in corners) / 4)

    def draw(self, canvas: ImageDraw.ImageDraw, color='black', width=1, text_color='black'):
        """Draw the four borders and, when set, the text near the top-left corner."""
        canvas.line((self.p1.as_tuple, self.p2.as_tuple), color, width)
        canvas.line((self.p2.as_tuple, self.p3.as_tuple), color, width)
        canvas.line((self.p3.as_tuple, self.p4.as_tuple), color, width)
        canvas.line((self.p4.as_tuple, self.p1.as_tuple), color, width)
        if self.text:
            canvas.text((self.p1.x + 3, self.p1.y + 3), self.text, fill=text_color, font=self.font)

    def print_cell(self):
        """Print the cell text inside a box-drawing frame."""
        text_lines = self.text.split('\n')
        longest = max(len(text_line) for text_line in text_lines)
        border = '┼' + "─" * longest + '┼\n'
        body = ''.join("│" + text_line + ' ' * (longest - len(text_line)) + "│\n"
                       for text_line in text_lines)
        print(border + body + border)

    def point_inside_polygon(self, point: 'Point', include_edges=True):
        """True when ``point`` lies strictly inside the box spanned by ``p1`` and ``p3``.

        Only the top-left and bottom-right corners are consulted, and points
        on the border are not inside.  ``include_edges`` is accepted for
        compatibility but is not used.
        """
        x, y = point.as_tuple
        x1, y1 = self.p1.as_tuple
        x2, y2 = self.p3.as_tuple
        return x1 < x < x2 and y1 < y < y2


class Table:
    """Maps skeleton (un-merged) grid positions onto the real table cells."""

    def __init__(self, cells: List[Cell], skeleton: List[List[Cell]], ugly_table: List[List[str]], words, canvas=None):
        self.cells = cells
        self.canvas = canvas
        self.words = words
        self.skeleton = skeleton
        self.ugly_table = ugly_table
        self.global_map = {}  # global_map[row][col] -> real Cell covering that slot

    def build_table(self):
        """Fill ``global_map``, the cell texts and the per-cell word lists."""
        for y, (text_row, skeleton_row) in enumerate(zip(self.ugly_table, self.skeleton)):
            self.global_map[y] = {}
            for x, (text, cell) in enumerate(zip(text_row, skeleton_row)):
                for t_cell in self.cells:
                    # A real cell owns every skeleton cell whose centre it contains.
                    if t_cell.point_inside_polygon(cell.center):
                        t_cell.text += text if text else ''
                        self.global_map[y][x] = t_cell
        processed_cells = []
        for cell in tqdm(self.cells, desc='Analyzing cells', unit='cells'):
            if cell in processed_cells:
                continue
            cell.words = [
                char for char in self.words
                if cell.point_inside_polygon(Point(char['x0'], char['top']))
            ]
            processed_cells.append(cell)
        if self.canvas:
            for cell in self.cells:
                cell.draw(self.canvas)

    def get_col(self, col_id) -> List[Cell]:
        """Cells at ``col_id`` in every mapped row."""
        return [row[col_id] for row in self.global_map.values()]

    def get_row(self, row_id) -> List[Cell]:
        """Cells of the row ``row_id``."""
        return list(self.global_map[row_id].values())

    def get_cell(self, x, y) -> Cell:
        """Cell at column ``x`` of row ``y``."""
        return self.global_map[y][x]

    def get_cell_span(self, cell):
        """Return ``(row_span, col_span)`` of ``cell``.

        ``row_span`` is the number of rows containing the cell and
        ``col_span`` the number of columns it covers in the first such row.
        Returns ``(0, 0)`` when the cell is not in the map (the original
        raised IndexError).
        """
        covered = {}
        for row_id, row in self.global_map.items():
            for col_id, t_cell in row.items():
                if t_cell == cell:
                    covered.setdefault(row_id, {})[col_id] = True
        if not covered:
            return 0, 0
        row_span = len(covered)
        col_span = len(next(iter(covered.values())))
        return row_span, col_span


class TableExtractor:
    """Extracts ``Table`` objects from the pages of a PDF via pdfplumber."""

    def __init__(self, path):
        self.pdf = pdfplumber.open(path)
        self.draw = False   # save debug images of lines, tables and skeletons
        self.debug = False  # print progress messages

    @staticmethod
    def filter_lines(lines: List[Line]):
        """Keep one line per set of lines that compare equal.

        ``Line.__eq__`` treats collinear lines as equal, so the membership
        test below drops every further line on the same x (or y).
        """
        new_lines = []
        for line1 in tqdm(list(set(lines)), desc='Filtering lines', unit='lines'):
            if line1 in new_lines:
                continue
            new_lines.append(line1)
        return list(set(new_lines))

    @staticmethod
    def add_skeleton_points(points, line):
        """Append both end points of ``line`` to ``points``."""
        points.append(line.p1)
        points.append(line.p2)

    def build_skeleton(self, lines):
        """Build the grid points and the un-merged skeleton cells.

        Returns ``(skeleton_points, skeleton)``.
        """
        skeleton_points = []
        skeleton = []
        # Template with every direction set; copied onto the skeleton points.
        temp_point = Point(0, 0)
        temp_point.down = temp_point.up = temp_point.left = temp_point.right = True
        vertical = [line for line in lines if line.vertical]
        horizontal = [line for line in lines if not line.vertical]
        for line1 in tqdm(vertical, desc='Building table skeleton', unit='lines'):
            sys.stdout.flush()
            if line1.length < 3.0:
                continue
            self.add_skeleton_points(skeleton_points, line1)
            for line2 in horizontal:
                if line1 == line2:
                    continue
                self.add_skeleton_points(skeleton_points, line2)
                crossing = line1.infite_intersect(line2)
                # infite_intersect returns (None, None) for parallel lines,
                # which is truthy; the original then crashed in Point().
                if crossing[0] is None or crossing[1] is None:
                    continue
                p1 = Point(crossing)
                if p1 not in skeleton_points:
                    skeleton_points.append(p1)
                for n, p in enumerate(skeleton_points):
                    skeleton_points[n].copy(temp_point)
                    if p == p1:
                        p1.copy(p)
                        skeleton_points[n] = p1
        skeleton_points = list(set(skeleton_points))
        sorted_y_points = sorted(skeleton_points, key=lambda other: other.y)
        for p1 in tqdm(sorted_y_points, desc='Building skeleton cells', unit='point'):
            p2 = p1.get_right(skeleton_points)
            if not p2:
                continue
            p3 = p2.get_bottom(skeleton_points, right=True)
            p4 = p1.get_bottom(skeleton_points, left=True)
            if p3 and p4:
                cell = Cell(p1, p2, p3, p4)
                if cell not in skeleton:
                    skeleton.append(cell)
        return skeleton_points, skeleton

    @staticmethod
    def skeleton_to_2d_table(skeleton: List[Cell]) -> List[List[Cell]]:
        """Group skeleton cells into rows (sorted by y), each sorted by x."""
        rows = []
        for cell in tqdm(skeleton, desc='Analyzing cell positions', unit='cells'):
            row = tuple(sorted(filter(lambda c: cell.on_same_row(c), skeleton), key=lambda c: c.p1.x))
            rows.append(row)
        rows = sorted(set(rows), key=lambda c: c[0].p1.y)
        return [list(row) for row in rows]

    def parse_page(self, page_n):
        """Extract the tables of page ``page_n`` (an index into ``pdf.pages``).

        Returns a list of built ``Table`` objects, or an empty list when
        more than five tables are found on the page.  With ``self.draw`` set,
        debug images are written to the current directory.
        """
        if self.debug:
            print('Parsing page', page_n)
        page = self.pdf.pages[page_n]
        if self.debug:
            print('Rendering page')
        if self.debug:
            print('Finding tables')
        tables = TableFinder(page, {'snap_tolerance': 3, 'join_tolerance': 3})
        if self.debug:
            print('Found', len(tables.tables), 'tables')
        beaut_tables = []
        if self.draw:
            p_im = page.to_image(resolution=100)
            p_im.draw_lines(page.lines)
            p_im.save('page-{}-lines.png'.format(page_n + 1))
        if len(tables.tables) > 5:
            return []
        for n, table in enumerate(tables.tables):
            if self.draw:
                p_im.reset()
                # PIL needs integer sizes; page dimensions may not be ints.
                im = Image.new('RGB', (math.ceil(page.width), math.ceil(page.height)), (255,) * 3)
                canvas = ImageDraw.ImageDraw(im)
            ugly_table = table.extract()
            lines = []  # type: List[Line]
            cells = []  # type: List[Cell]
            for bbox in tqdm(table.cells, desc='Parsing cells', unit='cells'):
                x1, y1, x2, y2 = bbox
                p1 = Point(x1, y1)
                p1.right = True
                p1.down = True
                p2 = Point(x2, y1)
                p2.left = True
                p2.down = True
                p3 = Point(x2, y2)
                p3.up = True
                p3.left = True
                p4 = Point(x1, y2)
                p4.up = True
                p4.right = True
                # Line() may reorder its end points and sets flags on them, so
                # the four lines are created in the original order.
                lines.append(Line(p1, p2))
                lines.append(Line(p2, p3))
                lines.append(Line(p3, p4))
                lines.append(Line(p4, p1))
                cells.append(Cell(p1, p2, p3, p4))
            lines = self.filter_lines(lines)
            if self.draw:
                p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
                im.save('page-{}-{}.png'.format(page_n + 1, n))
            skeleton_points, skeleton = self.build_skeleton(lines.copy())
            if not skeleton_points:
                continue
            skeleton = self.skeleton_to_2d_table(skeleton)
            beaut_table = Table(cells, skeleton, ugly_table, page.extract_words())
            beaut_table.build_table()
            if self.draw:
                for cell in beaut_table.cells:
                    cell.draw(canvas)
            if self.debug:
                print('Saving rendered table')
            if self.draw:
                p_im.save('page-{}-{}_im.png'.format(page_n + 1, n))
                im.save('page-{}-{}.png'.format(page_n + 1, n))
            if self.draw:
                # Clear the canvas, then render the skeleton with row-col labels.
                canvas.rectangle((0, 0, page.width, page.height), fill='white')
                for row_id, row in enumerate(skeleton):
                    for cell_id, cell in enumerate(row):
                        cell.text = '{}-{}'.format(row_id, cell_id)
                        cell.draw(canvas, color='green', text_color='red')
                im.save('page-{}-{}-skeleton.png'.format(page_n + 1, n))
            beaut_tables.append(beaut_table)
        return beaut_tables


if __name__ == '__main__':
    pdf_interpreter = TableExtractor(r"D:\PYTHON\py_pdf_stm\datasheets\STM32F\stm32f777.pdf")
    pdf_interpreter.draw = True
    pdf_interpreter.debug = True
    tables = pdf_interpreter.parse_page(16)
    print(tables)