# 对比表格数据 (compare table data)
# Compare two tables row by row using fuzzy string similarity.
#
# Each row is flattened into one string (its cells concatenated) and scored
# against every row of the other table with difflib.SequenceMatcher. A row
# whose best score stays below SIMILARITY_THRESHOLD has no counterpart in the
# other table: rows of the old table without a counterpart are reported as
# deleted, rows of the new table without a counterpart as added.
from difflib import SequenceMatcher

import numpy as np

# Rows whose best similarity ratio is strictly below this value are
# considered unmatched.
SIMILARITY_THRESHOLD = 0.82

# Sample "old" table (first row is the header).
data1 = [
    ['省份', '城市', '地区', '小区'],
    ['河南省', '郑州市', '东区', '碧桂园'],
    ['河北省', '天津市', '四环区', '瑞丰锦尚'],
    ['河北省', '未知市', '未知区', '0'],
    ['山西省', '晋城市', '长垣县', '0'],
]

# Sample "new" table (first row is the header).
data2 = [
    ['省份', '城市', '地区', '小区'],
    ['河南省', '郑州市', '东区', '碧桂园'],
    ['河北省', '天津市', '四环区', '瑞丰锦尚'],
    ['山西省', '晋城市', '长垣县', '0'],
    ['山西省', '未知市', '未知区', '0'],
]

# Alternative sample tables; they are defined but not referenced by the
# comparison below.
data11 = [
    ['省份', '城市', '地区', '小区'],
    ['河南省', '郑州市', '东区', '碧桂园'],
    ['河北省', '天津市', '四环区', '瑞丰锦尚'],
    ['', '未知市', '未知区', '0'],
    ['山西省', '未知市', '未知区', '0'],
]

data22 = [
    ['省份', '城市', '地区', '小区'],
    ['河南省', '郑州市', '东区', '碧桂园'],
    ['河北省', '天津市', '四环区', '瑞丰锦尚'],
    ['山西省', '晋城市', '长垣县', '0'],
    ['', '未知市', '未知区', '0'],
]


def read_docx_tables(path):
    """Return the cell text of every row of every table in a .docx file.

    Args:
        path: Path of the Word document to read.

    Returns:
        A list with one entry per table row (all tables concatenated in
        document order); each entry is the list of that row's cell texts.
    """
    # Imported lazily so the comparison helpers stay usable when python-docx
    # is not installed.
    from docx import Document

    doc = Document(path)
    return [
        [cell.text for cell in row.cells]
        for table in doc.tables
        for row in table.rows
    ]


def similarity_matrix(rows, others):
    """Score every row of ``rows`` against every row of ``others``.

    Args:
        rows: Sequence of rows, each a sequence of cell strings.
        others: Sequence of rows to compare against.

    Returns:
        A float array of shape ``(len(rows), len(others))`` where entry
        ``[i][j]`` is ``SequenceMatcher(None, a, b).ratio()`` for the
        concatenated cells ``a`` of ``rows[i]`` and ``b`` of ``others[j]``.
    """
    # Join each row once instead of re-joining inside the inner loop.
    joined_rows = ["".join(row) for row in rows]
    joined_others = ["".join(row) for row in others]

    # Pre-allocating keeps the 2-D shape even when either side is empty.
    matrix = np.zeros((len(joined_rows), len(joined_others)))
    for i, left in enumerate(joined_rows):
        for j, right in enumerate(joined_others):
            # Argument order is kept as (row, other): ratio() is not
            # guaranteed to be symmetric.
            matrix[i][j] = SequenceMatcher(None, left, right).ratio()
    return matrix


def find_unmatched_rows(rows, others, threshold=SIMILARITY_THRESHOLD):
    """Return the indices of rows that have no close match in ``others``.

    Args:
        rows: Sequence of rows to check, each a sequence of cell strings.
        others: Sequence of rows to search for a counterpart.
        threshold: A row is unmatched when its best similarity ratio is
            strictly below this value.

    Returns:
        The ``np.argwhere`` result for the unmatched rows: an integer array
        of shape ``(k, 1)`` holding the row indices into ``rows``.
    """
    matrix = similarity_matrix(rows, others)
    # ``initial=0.0`` lets the reduction work when ``others`` is empty (every
    # row then scores 0 and is reported as unmatched); ratios are never
    # negative, so it does not change the result for non-empty input.
    best_scores = np.max(matrix, axis=1, initial=0.0)
    return np.argwhere(best_scores < threshold)


def main():
    """Print the tables read from ``table.docx`` and the sample-table diff."""
    table_data = read_docx_tables("table.docx")
    print("table data:", table_data)

    # Rows of the old table without a counterpart in the new table.
    print("删除的行:", find_unmatched_rows(data1, data2))
    # Rows of the new table without a counterpart in the old table.
    print("增加的行:", find_unmatched_rows(data2, data1))


if __name__ == "__main__":
    main()