# -*- coding: utf-8 -*-
import pandas as pd
import os
import datetime
def bwread(dz):
out=[]
with open(dz,'r',encoding='UTF-8') as filein:
art=[]
while True:
txt=filein.readline()
if not txt:
break
if txt[:3]==r'<tr':
art=[]
jj=0
if txt[:13]==r'<td class="3Dx"':
if r'x:num=3D' in txt:
a=txt.find(r'"')+1
b=txt.find(r'"',a)
art.append(float(txt[a:b]))
jj+=1
else:
a=txt.find(r'>')+1
b=txt.find(r'</td>',a)
tt=txt[a:b]
tt=tt.replace('4700/', '')
tt=tt.replace('<br>', ' ')
tt=tt.replace(' ', ' ')
tt=tt.replace('<','<')
tt=tt.replace('>','>')
tt=tt.replace('&','&')
tt=tt.replace('&;','&')
tt=tt.replace(' ',' ')
tt=tt.strip()
art.append(tt)
jj+=1
if r'colspan' in txt:
a=txt.find(r'colspan')+10
b=txt.find(r' ',a+1)
for i in range(int(txt[a:b])-1):
art.append("")
jj+=1
if txt[:5]==r'</tr>':
if len(art)>0:
out.append(art)
return out
def bwfolderread(dz):
arr=[]
for rr,dd,ff in os.walk(dz):
pass
for f in ff:
dz_t=os.path.join(rr,f)
ar_t=bwread(dz_t)
arr+=ar_t
return arr
def clmgen(ar):
arr=[]
dic={}
dic2={}
dic_out={}
dic_out2={}
for i in ar:
if len(i)>=5 and i[0]!='' and i[0]!='Employee':
arr.append(i)
if arr[-1][3] not in dic.keys():
dic[arr[-1][3]]=0
dic[arr[-1][3]]+=1
if arr[-1][3]+arr[-1][5] not in dic2.keys():
dic2[arr[-1][3]+arr[-1][5]]=0
dic2[arr[-1][3]+arr[-1][5]]+=1
for i in arr:
i.append(dic[i[3]])
i.append(dic2[i[3]+i[5]])
if i[-1]==i[-2]:
dic_out[i[3]]=i[5]
dic_out2[i[3]]=i[0]
art=['Employee','Client','Client Name','Claim','Claim Item','Claim Type','Engagement',
'Engagement name','Internal Order','Internal Order name','Receipt amount',
'count_claim','count_claim_type']
df=pd.DataFrame(arr,columns=art)
return df,dic_out,dic_out2
def ktecgen(dz,dicc,dics):
ff=os.listdir(dz)
dic={}
for f in ff:
dz_t=os.path.join(dz,f)
df_t=pd.read_excel(dz_t,sheet_name='vat',
dtype=dict.fromkeys(list(pd.read_excel(dz_t,sheet_name='vat').columns),'str')) \
.fillna('')
dic[f]=df_t
df_all = pd.concat([i for i in dic.values()])
art=['发票唯一代号','条形码','发票代码','发票号码','购买方纳税人识别号','购买方开户行及账号',
'购买方名称','发票备注栏','人工备注','发票类型','货物或应税劳务、服务名称','规格型号',
'单位','数量','不含税价款(元)','税额(元)']
df=df_all.loc[df_all['check']=='0']
df=df[art]
ar=df.values.tolist()
arr=[]
arro=[]
for i in ar:
if '*' in i[10]:
artt=i[10].split('*')
elif '(' in i[10]:
artt=i[10].split('(')
elif '(' in i[10]:
artt=i[10].split('(')
else:
artt=[i[10]]
dictt={}
arttt=[]
for t in range(len(artt)):
artt[t]=artt[t].strip()
if artt[t]!='' \
and len(artt[t])<=12 \
and '0' not in artt[t] and '1' not in artt[t] and '2' not in artt[t] \
and '3' not in artt[t] and '4' not in artt[t] and '5' not in artt[t] \
and '6' not in artt[t] and '7' not in artt[t] and '8' not in artt[t] \
and '9' not in artt[t] and '[' not in artt[t] and '【' not in artt[t]:
if artt[t] not in dictt.keys():
dictt[artt[t]]=t
arttt.append(artt[t])
# =============================================================================
# for t in range(len(artt)-1,-1,-1):
# artt[t]=artt[t].strip()
# if artt[t] not in dictt.keys():
# dictt[artt[t]]=t
# if artt[t]=='':
# artt.pop(t)
# elif len(artt[t])>=12:
# artt.pop(t)
# elif '0' in artt[t] or '1' in artt[t] or '2' in artt[t] or '3' in artt[t] \
# or '4' in artt[t] or '5' in artt[t] or '6' in artt[t] or '7' in artt[t] \
# or '8' in artt[t] or '9' in artt[t]:
# artt.pop(t)
# =============================================================================
if arttt==[]:
t=i[10]
if '房' in t and '租' in t:
arttt.append('经营租赁')
arttt.append('房租')
elif '租金' in t:
arttt.append('经营租赁')
arttt.append('房租')
elif '租车' in t:
arttt.append('经营租赁')
arttt.append('租车')
elif 'A3' in t or 'A4' in t:
arttt.append('纸制品')
elif '笔' in t:
arttt.append('文具')
else:
arttt=[i[10]]
arttt.append('')
arttt.append('')
if i[1][8:18] in dicc.keys():
arr.append(i)
arr[-1].append(arttt[0])
arr[-1].append(arttt[1])
arr[-1].append(arttt[2])
arr[-1].append(dicc[i[1][8:18]])
arr[-1].append(dics[i[1][8:18]])
else:
arro.append(i)
arro[-1].append(arttt[0])
arro[-1].append(arttt[1])
arro[-1].append(arttt[2])
arro[-1].append('')
if i[1][8:18] in dics.keys():
arro[-1].append(dics[i[1][8:18]])
else:
arro[-1].append('')
art.append('type1')
art.append('type2')
art.append('type3')
art.append('claim type')
art.append('staff code')
df_model=pd.DataFrame(arr,columns=art)
df_other=pd.DataFrame(arro,columns=art)
art=['发票唯一代号','条形码','发票代码','发票号码',
'购买方名称','发票备注栏','人工备注','发票类型','货物或应税劳务、服务名称',
'type1','type2','type3',
'规格型号','单位','数量','不含税价款(元)','税额(元)','claim type','staff code']
df_model=df_model[art]
return df_model,df_other
bg_dt = datetime.datetime.now()
root_dz=r'C:\Users\lzh\101 claim type match'
clm_dz=os.path.join(root_dz,r'claimmonitoring')
ktec_dz=os.path.join(root_dz,r'ktec')
ar_clm=bwfolderread(clm_dz)
df_clm_all,dic_clm,dic_clm_s=clmgen(ar_clm)
df_clm=df_clm_all.loc[df_clm_all['count_claim']==df_clm_all['count_claim_type']]
df_model,df_other=ktecgen(ktec_dz,dic_clm,dic_clm_s)
print(df_model.shape)
print(df_other.shape)
dic_te={'01 Per diem':'delete',
'02 Trip - Meal':'meal',
'03 OT meal (non-taxable)':'meal',
'04 OT meal (taxable)':'meal',
'05 ENT-client,target&busi asso':'meal',
'06 ENT-staff entertainment':'meal',
'07 Hotel&ac w SpecialVATfapiao':'Lodging',
'08 Hotel & accommodation':'Lodging',
'14 Trip - Airfares':'Air',
'15 Trip - Non airfares':'Travel Miscellaneous',
'16 Trip - HK taxi':'Travel Miscellaneous',
'17Trip-excl. airfare & HK taxi':'Travel Miscellaneous',
'18Gift-client,target&busi asso':'marketing',
'19 Gift - staff':'Gift',
'20 Membership fee':'Subscriptions & Membership & Sponsorship',
'21 Course & exam fee':'Compensation & Benefits',
'22 Telephone & communication':'Telecommunication & Audio/Visual Conference & Associated Service',
'23 Ref book & publication subs':'Others',
'24 Postage & courier':'Courier, Express & Parcel',
'25 Medical (Partner&Director)':'Compensation & Benefits',
'26 Medical (Grade A to D)':'Compensation & Benefits',
'27 Medical (Support Grade)':'Compensation & Benefits',
'28 Office supplies':'Office Supplies',
'29 HR miscellaneous':'Compensation & Benefits',
'98 Cash advance':'delete',
'99 Others':'Others'}
#df_plan['Date']=pd.to_datetime(df_plan['Date']).dt.date
out_dz=os.path.join(root_dz,r'dataother.xlsx')
with pd.ExcelWriter(out_dz) as writer:
#df_model.to_excel(writer, sheet_name='ktec', index=False)
#df_clm.to_excel(writer, sheet_name='clm', index=False)
df_other.iloc[0:500000].to_excel(writer, sheet_name='ktec other', index=False)
df_other.iloc[500000:1000000].to_excel(writer, sheet_name='ktec other2', index=False)
diff=datetime.datetime.now()-bg_dt
diff_s=diff.days*24*60*60 + diff.seconds
print(diff_s)
print('Finished')