# -*- coding: utf-8 -*-
import pandas as pd
import os
import datetime
import copy
#0,1,2, ,3,4,5,6
def pcgen(dz): #0,1,2,3,4,5,6,7
ar_f=pd.read_excel(dz,sheet_name='PC CC Master',usecols='A,O,Q,R,S,T,W,Y', \
names=['A','B','C','D','E','F','G','H'], \
dtype={'A':str,'B':str,'C':str,'D':str,'E':str,'F':str,'G':str,'H':str}).fillna('')
ar=ar_f.values.tolist()
dic={}
for arr in ar:
if len(arr[0])==8 and arr[0][0]=='1':
dic[arr[0]]=[arr[2],arr[4],arr[5],arr[6],arr[7],arr[3]]
return dic
def stfgen(dz):
ar_f=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ar=ar_f.values.tolist()
dic={}
for i in ar:
dic[i[0]]=i[5]
return dic
def mapgen(dz):
ar_f=pd.read_excel(dz).fillna('')
ar=ar_f.values.tolist()
return ar
def stfggen(dz):
ar_f=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ar=ar_f.values.tolist()
dic={}
dicm={}
for k in ar:
if '0' in k[1] or '1' in k[1] or '2' in k[1] or '3' in k[1] \
or '4' in k[1] or '5' in k[1] or '6' in k[1] or '7' in k[1] \
or '8' in k[1] or '9' in k[1]:
dic[k[1].strip()]=[k[27],k[2]]
dicm[k[1].strip()]=[k[12],k[18]]
return dic,dicm
def bwread(dz):
out=[]
with open(dz,'r',encoding='UTF-8') as filein:
art=[]
while True:
txt=filein.readline()
if not txt:
break
if txt[:3]==r'<tr':
art=[]
jj=0
if txt[:13]==r'<td class="3Dx"':
if r'x:num=3D' in txt:
a=txt.find(r'"')+1
b=txt.find(r'"',a)
art.append(float(txt[a:b]))
jj+=1
else:
a=txt.find(r'>')+1
b=txt.find(r'</td>',a)
tt=txt[a:b]
tt=tt.replace('4700/', '')
tt=tt.replace('<br>', ' ')
tt=tt.replace(' ', ' ')
tt=tt.replace('<','<')
tt=tt.replace('>','>')
tt=tt.replace('&','&')
tt=tt.replace('&;','&')
tt=tt.replace(' ',' ')
tt=tt.strip()
art.append(tt)
jj+=1
if r'colspan' in txt:
a=txt.find(r'colspan')+10
b=txt.find(r' ',a+1)
for i in range(int(txt[a:b])-1):
art.append("")
jj+=1
if txt[:5]==r'</tr>':
if len(art)>0:
out.append(art)
return out
def bwfolderread(dz):
arr=[]
for rr,dd,ff in os.walk(dz):
pass
for f in ff:
dz_t=os.path.join(rr,f)
ar_t=bwread(dz_t)
arr+=ar_t
return arr
#################################################################################################
def clmgen(ar,art):
dicd={}
dicn={}
for i in ar:
if len(i)>=5 and i[0]!='' and i[0]!='period':
if i[3] not in dicd.keys():
dicd[i[3]]=[]
dicn[i[3]]=[[],[],0,0]
if i[4] not in dicd[i[3]]:
dicd[i[3]].append(i[4])
dicn[i[3]][0].append(i[4])
dicn[i[3]][1].append(float(i[13]))
dicn[i[3]][2]+=1
dicn[i[3]][3]+=float(i[13])
for k in dicd.keys():
kk=0
ard=dicd[k]
for i in ard:
if i in art:
kk+=1
if kk>1:
# =============================================================================
# print(dicd[k])
# print(dicn[k])
# print('-------')
# =============================================================================
artemp=[0 for i in art]
for i in range(len(dicn[k][0])):
if dicn[k][0][i] in art:
kt=art.index(dicn[k][0][i])
artemp[kt]+=dicn[k][1][i]
ktt=0
for i in range(1,len(artemp)):
if artemp[i]>artemp[ktt]:
ktt=i
for i in range(len(dicd[k])-1,-1,-1):
if dicd[k][i] in art:
if dicd[k][i]!=art[ktt]:
dicd[k].pop(i)
for i in range(len(dicn[k][0])):
if dicn[k][0][i] in art:
dicn[k][0][i]=art[ktt]
# =============================================================================
# print(dicd[k])
# print(dicn[k])
# print('-------------------')
# =============================================================================
# =============================================================================
# for k in dicd.keys():
# if len(dicd[k])>1:
# print(k)
# print(dicd[k])
# print(dicn[k])
# print('--------------------------')
# =============================================================================
return dicd,dicn
def vat_data_process(df,dicd,dicn,dicp,dics,dicstm,dfm,arst,dic_te):
ar=df.values.tolist()
art=df.columns.tolist()
k=art.index('货物或应税劳务、服务名称')
k2=art.index('条形码')
k3=art.index('发票代码')
k4=art.index('CNY')
k5=art.index('Commodity')
arr=[]
dicn_kt={}
for i in ar:
###################################################
if '*' in i[k]:
artt=i[k].split('*')
elif '(' in i[k]:
artt=i[k].split('(')
elif '(' in i[k]:
artt=i[k].split('(')
else:
artt=[i[k]]
dictt={}
arttt=[]
for t in range(len(artt)):
artt[t]=artt[t].strip()
if artt[t]!='' \
and len(artt[t])<=12 \
and '0' not in artt[t] and '1' not in artt[t] and '2' not in artt[t] \
and '3' not in artt[t] and '4' not in artt[t] and '5' not in artt[t] \
and '6' not in artt[t] and '7' not in artt[t] and '8' not in artt[t] \
and '9' not in artt[t] and '[' not in artt[t] and '【' not in artt[t]:
if artt[t] not in dictt.keys():
dictt[artt[t]]=t
arttt.append(artt[t])
if arttt==[]:
t=i[k]
if '房' in t and '租' in t:
arttt.append('经营租赁')
arttt.append('房租')
elif '租金' in t:
arttt.append('经营租赁')
arttt.append('房租')
elif '租车' in t:
arttt.append('经营租赁')
arttt.append('租车')
elif 'A3' in t or 'A4' in t:
arttt.append('纸制品')
elif '笔' in t:
arttt.append('文具')
else:
arttt=[i[k]]
arttt.append('')
arttt.append('')
if '房' in arttt[0] and '租' in arttt[0]:
arttt[0]='经营租赁'
arttt[1]='房租'
###################################################
bn=i[k2][8:18]
if bn not in dicn_kt.keys():
dicn_kt[bn]=[[],[],0,0]
dicn_kt[bn][1].append(i[k4])
dicn_kt[bn][2]+=1
dicn_kt[bn][3]+=i[k4]
sn=i[k2][:8]
fr=i[k3][:4]
if sn in dicstm.keys():
sg=dicstm[sn][0]
pc='1'+dicstm[sn][1][1:]
else:
sg=''
pc=''
if pc in dicp.keys():
pc_s=dicp[pc][2]
pc_r=dicp[pc][4]
else:
pc_s=''
pc_r=''
arr.append(i)
arr[-1].append(arttt[0])
arr[-1].append(arttt[1])
arr[-1].append(arttt[2])
arr[-1].append(sn)
arr[-1].append(fr)
arr[-1].append(sg)
arr[-1].append(pc)
arr[-1].append(pc_s)
arr[-1].append(pc_r)
art+=['type1','type2','type3','staff code','发票代码region','Grade','PC','Service','Region']
#print(dicp)
#print(dics)
#print(dicstm)
df=pd.DataFrame(arr,columns=art)
df=model_use(df,dfm,1)
arr=df.values.tolist()
art=df.columns.tolist()
#print(dicn)
#print(dicn_kt)
for i in arr:
bn=i[k2][8:18]
i.append('')
i.append('')
if bn in dicd.keys():
if len(dicd[bn])==1:
i[-2]='1v1'
i[-1]=dicd[bn][0]
if i[-1]=='':
if bn in dicn.keys():
if dicn[bn][2]==dicn_kt[bn][2] and dicn[bn][3]==dicn_kt[bn][3]:
if i[k4] in dicn[bn][1]:
i[-2]='amount matched'
i[-1]=dicn[bn][0][dicn[bn][1].index(i[k4])]
if i[-1]=='':
if i[-3]!='':
i[-2]='model generate'
i[-1]=i[-3]
if i[-3] in arst:
if bn in dicd.keys():
for k in dicd[bn]:
if k in arst:
if k!=i[-1]:
i[-2]='special type'
i[-1]=k
if i[-1]!='':
i[k5]=dic_te[i[-1]]
art+=['claim type source','claim type']
df=pd.DataFrame(arr,columns=art)
return df
###############################################################################################
def modelread(dz):
df=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ncn=df.columns.tolist()[-2:]
for k in df.columns.tolist():
if k in ncn:
df[k]=df[k].replace('','0')
df[k]=df[k].astype(float)
return df
def model_use(dfd,dfm,optim): #optim 1 compress model 0 not compress
art=dfd.columns.tolist()
ard=dfd.values.tolist()
arm=dfm.values.tolist()
for i in ard:
i.append('')
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if optim==1:
while True:
arm=model_compress(arm)
for i in ard:
if i[-1]=='':
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if len(arm[0])==5:
break
art.append('guess_type')
df=pd.DataFrame(ard,columns=art)
return df
def typefind(ard,arm,art):
for m in range(len(arm)-3):
if m % 2 == 0 :
if arm[m]!='':
try:
j=art.index(arm[m])
except:
return False
else:
break
elif m % 2 ==1:
k=arm[m]
if len(k)>=2:
if k[:2]=='>=' or \
(k[0]=='<' and k[1] in '-1234567890'):
ar_temp=k.split(' and ')
if len(ar_temp)==2:
kk1=float(ar_temp[0][2:])
kk2=float(ar_temp[1][1:])
if kk1>=ard[j] or kk2<ard[j]:
return False
elif len(ar_temp)==1:
if ar_temp[0][:2]=='>=':
kk=float(ar_temp[0][2:])
if kk>=ard[j]:
return False
elif ar_temp[0][:1]=='<':
kk=float(ar_temp[0][1:])
if kk<ard[j]:
return False
else:
if k!=ard[j]:
return False
else:
if k!=ard[j]:
return False
return True
def model_compress(arm):
dic={}
t=len(arm[0])-3
for i in arm:
if tuple(i[:t-2]+[i[-3]]) not in dic.keys():
dic[tuple(i[:t-2]+[i[-3]])]=0
dic[tuple(i[:t-2]+[i[-3]])]+=i[-1]
dic2={}
for k in dic.keys():
if k[:t-2] not in dic2.keys():
dic2[k[:t-2]]=['',0,0]
if dic[k] > dic2[k[:t-2]][2]:
dic2[k[:t-2]]=[k[-1],0,dic[k]]
ar=[]
for k in dic2.keys():
ar.append(list(k)+dic2[k])
return ar
#################################################################################################
def datetrim(dt):
dd=dt.strip()
return dd[:10]
def vatairgen(dz,dicp,dics,arm,dicd,dicn,dicstm,dfm,arst,dic_te):
ar_f1=pd.read_excel(dz,sheet_name='vat', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='vat').columns),'str')).fillna('')
ar_f1=ar_f1[['发票唯一代号','条形码','发票代码','发票号码','开票日期','合计不含税价款(元)',
'合计税额(元)','价税合计金额(元)','销售方纳税人识别号','销售方开户行及账号',
'销售方名称','销售方地址、电话','购买方纳税人识别号','购买方名称','发票类型',
'是否重票录入','操作日期','货物或应税劳务、服务名称','规格型号','单位',
'数量','单价(元)','税率(%)','调整后不含税价款(元)','调整后税款(元)']]
ar_f1.columns=['发票唯一代号','条形码','发票代码','发票号码','开票日期','合计不含税价款(元)',
'合计税额(元)','价税合计金额(元)','销售方纳税人识别号','销售方开户行及账号',
'Vendor name','销售方地址、电话','购买方纳税人识别号','购买方名称','发票类型',
'是否重票录入','Posting Date','货物或应税劳务、服务名称','规格型号','单位',
'数量','单价(元)','税率(%)','不含税价款(元)','税额(元)']
art_f1=ar_f1.columns.tolist()
ar_f1['Source']='VAT'
ar_f1['Commodity']='' #ar_f1['货物或应税劳务、服务名称'].apply(catamatch,args=(arm,))
ar_f1['不含税价款(元)']=ar_f1['不含税价款(元)'].astype('float')
ar_f1['税额(元)']=ar_f1['税额(元)'].astype('float')
ar_f1['CNY']=ar_f1['不含税价款(元)']+ar_f1['税额(元)']
ar_f1['Posting Date']=ar_f1['Posting Date'].apply(datetrim)
#print(ar_f1)
ar_f1=vat_data_process(ar_f1,dicd,dicn,dicp,dics,dicstm,dfm,arst,dic_te)
ar_f1_out=copy.copy(ar_f1)
ar_f1=ar_f1[art_f1+['Source','Commodity','CNY']]
#print(ar_f1)
ar_f2=pd.read_excel(dz,sheet_name='air', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='air').columns),'str')).fillna('')
ar_f2=ar_f2[['发票唯一代号','条形码','电子客票号码','填开日期','销售单位代号','填开单位',
'总额','操作日期']]
ar_f2.drop_duplicates(subset=['电子客票号码'], keep='first',inplace=True)
ar_f2.columns=['发票唯一代号','条形码','发票号码','开票日期','销售方纳税人识别号',
'Vendor name','价税合计金额(元)','Posting Date']
ar_f2['Source']='Air'
ar_f2['Commodity']='Air'
ar_f2['CNY']=ar_f2['价税合计金额(元)']
ar_f2['Posting Date']=ar_f2['Posting Date'].apply(datetrim)
df=pd.concat([ar_f1,ar_f2], axis=0).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
for i in ar:
if i[art.index('Vendor name')]=='':
i[art.index('Vendor name')]='No Vendor'
#i[art.index('Posting Date')]=i[art.index('Posting Date')][:10]
k=i[art.index('条形码')]
if len(k)!=20 or k.isdigit()==False:
i.append('')
i.append('')
i.append('')
i.append('')
i.append('')
else:
i.append(k[:8])
i.append(k[8:])
if i[-2] in dics.keys():
i.append(dics[i[-2]][-8:])
else:
i.append('')
kk='1' + i[-1][-7:]
if i[-1]!='' and kk in dicp.keys():
i.append(dicp[kk][3])
i.append(dicp[kk][5])
else:
i.append('')
i.append('')
art.append('Staff Num')
art.append('Claim Num')
art.append('Requestor PC')
art.append('Requestor SG')
art.append('Requestor Office')
df=pd.DataFrame(ar,columns=art)
return df,ar_f1_out
def vatairout(dz,ary,dicp,dics,arm,vmdz,dicd,dicn,dicstm,dfm,arst,dic_te,dzvat): #dzo,
# =============================================================================
# if os.path.exists(dzo)==True:
# os.remove(dzo)
# =============================================================================
dic_out={}
k=0
for yyy in ary:
dz_y=os.path.join(dz,yyy)
ff=os.listdir(dz_y)
for f in ff:
dz_t=os.path.join(dz_y,f)
df,df_vat=vatairgen(dz_t,dicp,dics,arm,dicd,dicn,dicstm,dfm,arst,dic_te)
df=vat_data_tag(df,vmdz)
#df.to_excel('test.xlsx',index=False)
#dzoo=os.path.join(dzo,'vatair' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx') ################################
dzov=os.path.join(dzvat,'vat' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx')
#dftotxt(df,dzoo,k)
#df.to_excel(dzoo,index=False) ########################################
df_vat.to_excel(dzov,index=False)
dic_out[str(k) + ' ' + f[:f.rindex('.')] + '.xlsx']=df
#print('vatair',f,'Outputed') ######################################
k+=1
return dic_out
def othergen(dz,dicp,dics):
ar_f1=pd.read_excel(dz,sheet_name='quota', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='quota').columns),'str')).fillna('')
ar_f1=ar_f1[['批次号','发票唯一代号','条形码','发票代码','发票号码','金额(元)','录入时间',
'人工备注','图片原件','源文件','操作员','操作日期']]
ar_f1.columns=['批次号','发票唯一代号','条形码','发票代码','发票号码','金额(元)','录入时间',
'人工备注','图片原件','源文件','操作员','Posting Date']
ar_f1['Source']='Fixed Invoice'
ar_f1['Commodity']='Others'
ar_f1['Posting Date']=ar_f1['Posting Date'].apply(datetrim)
ar_f2=pd.read_excel(dz,sheet_name='train', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='train').columns),'str')).fillna('')
ar_f2=ar_f2[['批次号','发票唯一代号','条形码','发票号码','金额(元)','乘车日期','乘车时间',
'乘车人姓名','车次','座位级别','始发站','终点站','录入时间','人工备注',
'图片原件','源文件','操作员','操作日期']]
ar_f2.columns=['批次号','发票唯一代号','条形码','发票号码','金额(元)','乘车日期','乘车时间',
'乘车人','车次','座位级别','始发站','终点站','录入时间','人工备注',
'图片原件','源文件','操作员','Posting Date']
ar_f2['Source']='Train'
ar_f2['Commodity']='Travel Miscellaneous'
ar_f2['Posting Date']=ar_f2['Posting Date'].apply(datetrim)
ar_f3=pd.read_excel(dz,sheet_name='taxi', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='taxi').columns),'str')).fillna('')
ar_f3=ar_f3[['批次号','发票唯一代号','条形码','发票代码','发票号码','金额(元)','乘车日期',
'里程','上车时间','下车时间','发票所在地','录入时间', '人工备注','图片原件',
'源文件','操作员','操作日期']]
ar_f3.columns=['批次号','发票唯一代号','条形码','发票代码','发票号码','金额(元)','乘车日期',
'里程','上车时间','下车时间','发票所在地','录入时间', '人工备注','图片原件',
'源文件','操作员','Posting Date']
ar_f3['Source']='Taxi'
ar_f3['Commodity']='Travel Miscellaneous'
ar_f3['Posting Date']=ar_f3['Posting Date'].apply(datetrim)
ar_f4=pd.read_excel(dz,sheet_name='bus', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='bus').columns),'str')).fillna('')
ar_f4=ar_f4[['批次号','发票唯一代号','条形码','发票代码','发票号码','日期','时间','出发车站',
'到达车站','总金额','姓名','录入时间','人工备注','图片原件','源文件','操作员','操作日期']]
ar_f4.columns=['批次号','发票唯一代号','条形码','发票代码','发票号码','乘车日期','乘车时间','始发站',
'终点站','金额(元)','乘车人','录入时间','人工备注','图片原件','源文件','操作员','Posting Date']
ar_f4['Source']='Car'
ar_f4['Commodity']='Travel Miscellaneous'
ar_f4['Posting Date']=ar_f4['Posting Date'].apply(datetrim)
ar_f5=pd.read_excel(dz,sheet_name='other', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='other').columns),'str')).fillna('')
ar_f5=ar_f5[['批次号','发票唯一代号','条形码','发票代码','发票号码','总金额','录入时间','人工备注',
'图片原件','源文件','操作员','操作日期']]
ar_f5.columns=['批次号','发票唯一代号','条形码','发票代码','发票号码','金额(元)','录入时间','人工备注',
'图片原件','源文件','操作员','Posting Date']
ar_f5['Source']='Other'
ar_f5['Commodity']='Others'
ar_f5['Posting Date']=ar_f5['Posting Date'].apply(datetrim)
ar_f6=pd.read_excel(dz,sheet_name='uncertain', \
dtype=dict.fromkeys(list(pd.read_excel(dz,sheet_name='uncertain').columns),'str')).fillna('')
ar_f6=ar_f6[['批次号','发票唯一代号','条形码','发票代码','发票号码','开票日期','不含税金额','发票类型',
'乘车人','录入时间','错误原因','图片原件','源文件','操作员','操作日期']]
ar_f6.columns=['批次号','发票唯一代号','条形码','发票代码','发票号码','开票日期','金额(元)','发票类型',
'乘车人','录入时间','人工备注','图片原件','源文件','操作员','Posting Date']
ar_f6['Source']='Pending'
dic_m={'增值税普通发票(纸质票)':'Others',
'其他发票':'Others',
'增值税专用发票':'Others',
'定额发票':'Others',
'火车票':'Travel Miscellaneous',
'客运汽车':'Travel Miscellaneous',
'出租车发票':'Travel Miscellaneous',
'航空运输电子客票行程单':'Air'}
ar_f6['Commodity']=ar_f6['发票类型'].map(dic_m)
ar_f6['Posting Date']=ar_f6['Posting Date'].apply(datetrim)
df=pd.concat([ar_f1,ar_f2,ar_f3,ar_f4,ar_f5,ar_f6], axis=0).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
for i in ar:
k=i[art.index('条形码')]
if len(k)!=20 or k.isdigit()==False:
i.append('')
i.append('')
i.append('')
i.append('')
i.append('')
else:
i.append(k[:8])
i.append(k[8:])
if i[-2] in dics.keys():
i.append(dics[i[-2]][-8:])
else:
i.append('')
kk='1' + i[-1][-7:]
if i[-1]!='' and kk in dicp.keys():
i.append(dicp[kk][3])
i.append(dicp[kk][5])
else:
i.append('')
i.append('')
art.append('Staff Num')
art.append('Claim Num')
art.append('Requestor PC')
art.append('Requestor SG')
art.append('Requestor Office')
df=pd.DataFrame(ar,columns=art)
return df
####数据加tag
def vat_data_tag(data,dz):
df=pd.read_excel(dz).fillna('')
ar=df.values.tolist()
art=df.columns.tolist()
dic={}
for j,k in enumerate(art):
dic[k]=''
for r in ar:
if r[j]!='':
dic[k]+=(r[j]+'|')
dic[k]=dic[k][:len(dic[k])-1]
#print(dic)
###########筛选出hotel数据
contn_join = dic['hotel contn']#'住宿|房费|场地|房租|房屋|租房|住房|私房出租|场地|租金|会展服务*场地费|经营租赁'
nocontn_join = dic['hotel nocontn']#'洗衣|其他|电话|杂费|洗涤|迷你吧|餐费|餐饮|打印|通行|有形动产|车|会议|体育|车|浙|花卉|礼服|电脑'
data.loc[(data['货物或应税劳务、服务名称'].astype(str).str.contains(contn_join)) \
&(~data['货物或应税劳务、服务名称'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'发票货物服务类型'] \
='hotel'
#print(data)
####筛选出餐饮数据
# =============================================================================
# contn = '餐饮|餐费|乳制品|食品|饮品|茶歇|奶茶|零食|坚果|水果|饮料|肉|蔬菜|酒|茶|\
# 豆制品|谷物|蛋制品|调味品|水产|畜禽|淀粉|食用菌|植物油|干豆|\
# 糖|薯豆|发酵类制品|薯类|盒马|香料原料|盐|果类|罐头|咖啡|星巴克|麦当劳|京东|天猫|超市'
# nocontn_join = '家具|酒店|住宿|体育用品|休闲用品|化学用品|咨询服务|医疗|医药|印刷品|电池|塑料制品|\
# 护理品|家用|工艺品|文具|日用|玻璃器皿'
# =============================================================================
contn_join = dic['meal contn']
nocontn_join = dic['meal nocontn']
data.loc[(data['货物或应税劳务、服务名称'].astype(str).str.contains(contn_join)) \
&(~data['货物或应税劳务、服务名称'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'发票货物服务类型'] \
='meal'
#print(data)
######读取交通费
####################从服务名称筛选出客运服务费
# =============================================================================
# contn = '出租车运输服务|出租汽车客运服务|网约车交通出行费|运输服务客运服务|出租车客运服务|\
# 代订车服务费|约车服务费|客运服务费|\*运输服务\*交通运输服务|\
# \*运输服务\*客运服务|\*信息技术服务\*用车平台服务费|用车服务费|网约车|\
# \*运输服务\*出行服务|\*运输服务\*运输服务费'
# nocontn_join = '\*运输服务\*地铁客运服务费|\*运输服务\*定制公交客运服务费'
# =============================================================================
contn_join = dic['car contn']
nocontn_join = dic['car nocontn']
data.loc[(data['货物或应税劳务、服务名称'].astype(str).str.contains(contn_join)) \
&(~data['货物或应税劳务、服务名称'].astype(str).str.contains(nocontn_join)) \
&(data['Source']=='VAT'), \
'发票货物服务类型'] \
='carhailing'
#print(data)
####加油费停车费
data.loc[(data['货物或应税劳务、服务名称'].str.contains(dic['park contn'])) \
&(data['Source']=='VAT') ,\
'发票货物服务类型']='停车费' #"车辆停放|停车|临停|泊车|车位"
data.loc[data['货物或应税劳务、服务名称'].str.contains(dic['gas contn']) \
&(data['Source']=='VAT') ,\
'发票货物服务类型']='加油费' #"加油|汽油|柴油"
###others
data.loc[(data['Source']=='VAT')&(data['发票货物服务类型'].isna()), \
'发票货物服务类型']='others'
data.loc[data['Source']!='VAT','发票货物服务类型']=''
return data
def otherout(dz,ary,dicp,dics): #dzo,
# =============================================================================
# if os.path.exists(dzo)==True:
# os.remove(dzo)
# =============================================================================
dic_out={}
k=0
for yyy in ary:
dz_y=os.path.join(dz,yyy)
ff=os.listdir(dz_y)
for f in ff:
dz_t=os.path.join(dz_y,f)
df=othergen(dz_t,dicp,dics)
#dzoo=os.path.join(dzo,'other' + str(k) + ' ' + f[:f.rindex('.')] + '.xlsx') ##################################
#df.to_excel('testo.xlsx',index=False)
#dftotxt(df,dzoo,k)
#df.to_excel(dzoo,index=False) ##################################
dic_out[str(k) + ' ' + f[:f.rindex('.')] + '.xlsx']=df
#print('other',f,'Outputed') #############################
k+=1
return dic_out
def dftotxt(df,dz,k):
# =============================================================================
# if k==0:
# ar=df.values.tolist()
# art=[df.columns.tolist()]
# arr=art+ar
# else:
# ar=df.values.tolist()
# arr=ar
# =============================================================================
ar=df.values.tolist()
art=[df.columns.tolist()]
arr=art+ar
with open(dz,'w',encoding='UTF-8') as fo:
for i in arr:
k=''
for j in i:
if k=='':
k=str(j)
else:
k+=('//////'+str(j))
k+='\n'
fo.write(k)
def allcombine(dic_av,dic_o,dicsg,dz_o,dz_v):
arsg=[]
for k in dicsg.keys():
arsg.append([k,dicsg[k][0],dicsg[k][1]])
dfsg=pd.DataFrame(arsg,columns=['Staff Num','Staff Grade','Staff name'])
ar_vendor=pd.read_excel(dz_v).fillna('').values.tolist()
for k in dic_o.keys():
if k in dic_av.keys():
df_av=dic_av[k]
df_o=dic_o[k]
df_o.rename(columns={'金额(元)':'CNY'},inplace=True)
df_o.loc[df_o['Source']=='Train','Vendor name']='中国铁道网络有限公司'
df_o.loc[df_o['Source']!='Train','Vendor name']='No Vendor'
dfa=pd.concat([df_o,df_av])
dfa['CNY']=dfa['CNY'].astype(float)
df=pd.merge(dfa,dfsg,how='left',on='Staff Num')
dz=os.path.join(dz_o,'all' + k)
df.to_excel(dz,index=False)
ar=df['Vendor name'].values.tolist()
for i in ar:
if [i] not in ar_vendor:
ar_vendor.append([i])
print('all',k,'Outputed')
df=pd.DataFrame(ar_vendor,columns=['Vendor name'])
df.to_excel(dz_v,index=False)
bg_dt = datetime.datetime.now()
print('start-----')
#yys=input(r'Please input YEARS separated by comma:')
yys='202107'
ary=yys.strip().split(',')
root_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec'
ktac_dz=os.path.join(root_dz,r'K-tec combine')
#out_dz_air=os.path.join(root_dz,r'air data')
#out_dz_other=os.path.join(root_dz,r'other data')
out_dz_all=os.path.join(root_dz,r'all')
out_dz_vat=os.path.join(root_dz,r'vat')
map_dz=os.path.join(root_dz,r'4.Ktec mapping for python - final.xlsx')
vat_map_dz=os.path.join(root_dz,r'vat type mapping.xlsx')
vendor_map_dz=os.path.join(root_dz,r'vendor mapping.xlsx')
pc_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\PC CC master (valued).xlsx'
stf_dz=r'\\cnbjsfsr550\fingroup$\VBA\REFERENCES\staff email\KAR staff list.xlsx'
stfg_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\2024.10_KPI Staff List to Finance.xlsx'
model_dz=r'G:\Management team(Shared)\Data Analytical\08. Vendor Payment\supporting\Procurement - sitong\4. procurement Python\Ktec\max_model using.xlsx'
dic_te={'01 Per diem':'delete',
'02 Trip - Meal':'Meal',
'03 OT meal (non-taxable)':'Meal',
'04 OT meal (taxable)':'Meal',
'05 ENT-client,target&busi asso':'Meal',
'06 ENT-staff entertainment':'Meal',
'07 Hotel&ac w SpecialVATfapiao':'Lodging',
'08 Hotel & accommodation':'Lodging',
'09 Trip-Air-domestic 差旅费-国内机票':'Air',
"10 Trip-Air-int'l 差旅费-国际机票":'Air',
'14 Trip - Airfares':'Air',
'15 Trip - Non airfares':'Travel Miscellaneous',
'16 Trip - HK taxi':'Travel Miscellaneous',
'17Trip-excl. airfare & HK taxi':'Travel Miscellaneous',
'18Gift-client,target&busi asso':'Marketing',
'19 Gift - staff':'Gift',
'20 Membership fee':'Subscriptions & Membership & Sponsorship',
'21 Course & exam fee':'Compensation & Benefits',
'22 Telephone & communication':'Telecommunication & Audio/Visual Conference & Associated Service',
'23 Ref book & publication subs':'Others',
'24 Postage & courier':'Courier, Express & Parcel',
'25 Medical (Partner&Director)':'Compensation & Benefits',
'26 Medical (Grade A to D)':'Compensation & Benefits',
'27 Medical (Support Grade)':'Compensation & Benefits',
'28 Office supplies':'Office Supplies',
'29 HR miscellaneous':'Compensation & Benefits',
'30 OT meal (Meituan)':'Meal',
'31 Trip-Non airfares (Meituan)':'Travel Miscellaneous',
'98 Cash advance':'delete',
'99 Others':'Others'}
ar_special_type=['02 Trip - Meal','03 OT meal (non-taxable)','04 OT meal (taxable)',
'05 ENT-client,target&busi asso','06 ENT-staff entertainment']
######################################################################
clm_dz=os.path.join(root_dz,r'claimmonitoring for ktec')
ar_clm=bwfolderread(clm_dz)
dic_clm_des,dic_clm_num=clmgen(ar_clm,ar_special_type)
######################################################################
dic_pc=pcgen(pc_dz)
#print(dic_pc)
dic_stf=stfgen(stf_dz)
dic_stfg,dic_stfg_m=stfggen(stfg_dz)
df_model=modelread(model_dz)
ar_map=mapgen(map_dz)
dic_vatair=vatairout(ktac_dz,ary,dic_pc,dic_stf,ar_map,vat_map_dz,dic_clm_des,dic_clm_num,dic_stfg_m,df_model,
ar_special_type,dic_te,out_dz_vat) #out_dz_air,
dic_other=otherout(ktac_dz,ary,dic_pc,dic_stf) #out_dz_other,
allcombine(dic_vatair,dic_other,dic_stfg,out_dz_all,vendor_map_dz)
#df_plan['Date']=pd.to_datetime(df_plan['Date']).dt.date
diff=datetime.datetime.now()-bg_dt
diff_s=diff.days*24*60*60 + diff.seconds
print(diff_s)
print('Finished')
标签:art,df,ktacgen,dic,ar,dz,append From: https://www.cnblogs.com/sitongyan/p/18562096