1.多字段连接,连接字段名不一致–left_on\right_on对应列示后可匹配
import pandas as pd
df_A1= pd. read_excel( 'E:\Mercy\data\mytest\A.xlsx' , sheet_name= 0 )
df_A2= pd. read_excel( 'E:\Mercy\data\mytest\A.xlsx' , sheet_name= 1 )
df_A= df_A1. merge( right= df_A2, how= 'left' , left_on= [ '公司名' , '业务主体' ] , right_on= [ '公司名' , '业务线' ] )
df_A
公司名 业务主体 业务线 金额 0 南京 成长 成长 1 1 天津 成长 成长 2 2 北京 成长 成长 3 3 南京 育儿 育儿 4
2.merge一对多left–效果同sql里的left join
(一侧或被多侧重复,右侧会被左侧剔除)
import pandas as pd
df_A2= pd. read_excel( 'E:\Mercy\data\mytest\A.xlsx' , sheet_name= 1 )
df_A3= pd. read_excel( 'E:\Mercy\data\mytest\A.xlsx' , sheet_name= 2 )
df_A22= df_A2. iloc[ : , [ 0 , 2 ] ]
df_A3. merge( df_A22, how= 'right' , on= '公司名' )
3.xlsxwriter写入excel文本和图片
import pandas as pd
import xlsxwriter as xlsx
year_month= input ( '请输入计算期年月(YYYY_MM):' )
book= xlsx. Workbook( 'E:\\Mercy\\data\\mytest\\' + year_month+ '测试.xlsx' )
sheet= book. add_worksheet( 'demo' )
pm = { 'bold' : True , 'font_name' : '微软雅黑' , 'color' : 'red'
}
formate = book. add_format( pm)
amount= 56
sheet. write( 'A1' , '实际支付金额:' + str ( amount) , formate)
numform_int= book. add_format( { 'num_format' : '0' } )
df_A2= pd. read_excel( 'E:\Mercy\data\mytest\A.xlsx' , sheet_name= 1 )
rown= 9
for index, row in df_A2. iterrows( ) : sheet. write_string( rown, 0 , row[ df_A2. columns[ 0 ] ] ) sheet. write_string( rown, 1 , row[ df_A2. columns[ 1 ] ] ) sheet. write_number( rown, 2 , row[ df_A2. columns[ 2 ] ] , numform_int) rown += 1
import matplotlib. pyplot as plt
import numpy as np
from io import BytesIO
x = np. linspace( 0 , 10 , 1000 )
plt. plot( x, np. sin( x) )
imagedata= BytesIO( )
plt. savefig( imagedata)
sheet. insert_image( 0 , 0 , '' , { "image_data" : imagedata} ) book. close( )
请输入计算期年月(YYYY_MM):20
4.读取模糊文件名
import sys
import re
import pandas as pd
import ospat= re. compile ( '全渠道.*?' )
for file_name in os. listdir( 'E:\Mercy\data\public' ) : if re. match ( pat, file_name) : depart_path= 'E:\Mercy\data\public\\' + file_name
df_depart= pd. read_excel( depart_path, sheet_name= '门店名称匹配' , header= 1 )
df_depart= df_depart. drop( '数值' , axis= 1 )
df_depart. rename( columns= { '文本' : '门店' } , inplace= True )
df_depart[ '门店' ] = df_depart[ '门店' ] . apply ( lambda x: str ( x) )
df_depart. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 17 columns):# Column Non-Null Count Dtype
--- ------ -------------- ----- 0 门店 445 non-null object 1 管报名称 445 non-null object 2 BI名称 445 non-null object 3 会员KPI名称 445 non-null object 4 分部 445 non-null object 5 地级市 444 non-null object 6 城市等级 444 non-null object 7 店总 444 non-null object 8 开店日期 445 non-null datetime64[ns]9 开店日期.1 445 non-null object 10 年级 445 non-null object 11 规模店 444 non-null object 12 物业类型 419 non-null object 13 属性 444 non-null object 14 计租方式 346 non-null object 15 使用面积 431 non-null float64 16 面积划分 385 non-null object
dtypes: datetime64[ns](1), float64(1), object(15)
memory usage: 59.2+ KB
5.一串字符串是否包含某些关键词的任意一个any|全部all
ls1= '服务-在线教育-家庭教育-能力启蒙;服务-在线教育-家庭教育-性格培养;服务-在线教育-家庭教育-亲子成长;服务-教育培训-早教-早教中心;'
ls2= '服务-才艺运动-运动-篮球;'
ls3= '服务-在线教育-家庭教育-能力启蒙;服务-在线教育-家庭教育-性格培养;'
kwl= [ '性格培养' , '亲子成长' ]
kw= '性格培养'
any ( kw in ls3 for kw in kwl)
all ( kw in ls3 for kw in kwl)
kw in ls1
ls2. find( kw) > - 1
-1
for i in range ( len ( df_guiderule) ) : print ( '.*' + df_guiderule. iloc[ i, 0 ] + '.*' , df_guiderule. iloc[ i, 1 ] )
6.找到文件夹下最新文件
import os
def find_new_file ( dir ) : '''查找目录下最新的文件''' file_lists = os. listdir( dir ) file_lists. sort( key= lambda fn: os. path. getmtime( dir + "\\" + fn) if not os. path. isdir( dir + "\\" + fn) else 0 ) print ( '最新的文件为: ' + file_lists[ - 1 ] ) file = os. path. join( dir , file_lists[ - 1 ] ) print ( '完整路径:' , file ) return file dir = r'E:\WXWork\1688853870925791\WeDrive\孩子王\战略支持-数字化分析\1、制度流程规则\门店信息表'
find_new_file( dir )
7.解决pandas to_excel覆盖原sheet
from openpyxl import load_workbook
import pandas as pd
topath= 'E:\Mercy\data\public\门店test.xlsx'
depart_path= 'E:\Mercy\data\public\门店名称匹配.xlsx' df_prov= pd. read_excel( depart_path, sheet_name= '分部汇总' , header= 0 , usecols= 'A:D,F:K' , nrows= 22 )
df_prov. to_excel( topath, sheet_name= 'Sheet2' , index= False )
from openpyxl import load_workbook
import pandas as pd
topath= 'E:\Mercy\data\public\门店test.xlsx'
depart_path= 'E:\Mercy\data\public\门店名称匹配.xlsx' target_writer = pd. ExcelWriter( topath, engine= 'openpyxl' )
book = load_workbook( target_writer. path)
target_writer. book = book
df_prov= pd. read_excel( depart_path, sheet_name= '分部汇总' , header= 0 , usecols= 'A:D,F:K' , nrows= 22 )
df_prov. to_excel( excel_writer= target_writer, sheet_name= 'Sheet2' , index= False )
target_writer. save( )
target_writer. close( )
8.制作词云
filename = 'E:\Mercy\data\mytest\wordcloudtest.txt'
with open ( filename, encoding= 'utf-8' ) as f: text = f. read( )
import jieba
jieba. setLogLevel( jieba. logging. INFO)
text = jieba. cut( text)
text
<generator object Tokenizer.cut at 0x0000018F0935EF90>
stopword= [ '的' , '是' ]
stayed_line = ""
for te in text: if te not in stopword: stayed_line += te+ " "
import imageio
graph = imageio. imread( "E:\Mercy\data\mytest\star.png" ) '''
import numpy as np
from PIL import Image
image = Image.open(r'E:\Mercy\data\mytest\star.png')
graph = np.array(image)'''
"\nimport numpy as np\nfrom PIL import Image\nimage = Image.open(r'E:\\Mercy\\data\\mytest\\star.png')\ngraph = np.array(image)"
from wordcloud import WordCloudwc = WordCloud( font_path= "C:\Windows\Fonts\STHUPO.TTF" , background_color= "white" , max_words= 1600 , mask= graph, contour_width= 0 , contour_color= 'steelblue' ) . generate( stayed_line)
import matplotlib. pyplot as plt
plt. imshow( wc, interpolation= "bilinear" )
plt. axis( "off" )
wc. to_file( r'E:\Mercy\data\mytest\wc_cn.png' )
<wordcloud.wordcloud.WordCloud at 0x18f08300520>
读取db文件
import sqlite3
conn= sqlite3. connect( 'message.db' )
c= conn. cursor( )
tables= c. execute( "SELECT name FROM sqlite_master WHERE type='table';" ) . fetchall( )
print ( tables)
[]
c. execute( "select * from table" ) . fetchall( )
c. close( )
conn. close( )
---------------------------------------------------------------------------OperationalError Traceback (most recent call last)<ipython-input-7-e4488a0f934e> in <module>1 #正常执行SQL查询
----> 2 c.execute("select * from table").fetchall()3 4 #关闭5 c.close()OperationalError: near "table": syntax error