# -*- coding: utf-8 -*-
import pandas as pd
import datetime
def dataread(dz,ncn):
df=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
for k in df.columns.tolist():
if k in ncn:
df[k]=df[k].replace('','0')
df[k]=df[k].astype(float)
return df
def modelread(dz):
df=pd.read_excel(dz,dtype=dict.fromkeys(list(pd.read_excel(dz).columns),'str')).fillna('')
ncn=df.columns.tolist()[-2:]
for k in df.columns.tolist():
if k in ncn:
df[k]=df[k].replace('','0')
df[k]=df[k].astype(float)
return df
def model_use(dfd,dfm,optim): #optim 1 compress model 0 not compress
art=dfd.columns.tolist()
ard=dfd.values.tolist()
arm=dfm.values.tolist()
for i in ard:
i.append('')
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if optim==1:
while True:
arm=model_compress(arm)
for i in ard:
if i[-1]=='':
for m in arm:
if typefind(i,m,art)==True:
i[-1]=m[-3]
break
if len(arm[0])==5:
break
art.append('guess_type')
df=pd.DataFrame(ard,columns=art)
return df
def typefind(ard,arm,art):
for m in range(len(arm)-3):
if m % 2 == 0 :
if arm[m]!='':
try:
j=art.index(arm[m])
except:
return False
else:
break
elif m % 2 ==1:
k=arm[m]
if len(k)>=2:
if k[:2]=='>=' or \
(k[0]=='<' and k[1] in '-1234567890'):
ar_temp=k.split(' and ')
if len(ar_temp)==2:
kk1=float(ar_temp[0][2:])
kk2=float(ar_temp[1][1:])
if kk1>=ard[j] or kk2<ard[j]:
return False
elif len(ar_temp)==1:
if ar_temp[0][:2]=='>=':
kk=float(ar_temp[0][2:])
if kk>=ard[j]:
return False
elif ar_temp[0][:1]=='<':
kk=float(ar_temp[0][1:])
if kk<ard[j]:
return False
else:
if k!=ard[j]:
return False
else:
if k!=ard[j]:
return False
return True
def model_compress(arm):
dic={}
t=len(arm[0])-3
for i in arm:
if tuple(i[:t-2]+[i[-3]]) not in dic.keys():
dic[tuple(i[:t-2]+[i[-3]])]=0
dic[tuple(i[:t-2]+[i[-3]])]+=i[-1]
dic2={}
for k in dic.keys():
if k[:t-2] not in dic2.keys():
dic2[k[:t-2]]=['',0,0]
if dic[k] > dic2[k[:t-2]][2]:
dic2[k[:t-2]]=[k[-1],0,dic[k]]
ar=[]
for k in dic2.keys():
ar.append(list(k)+dic2[k])
return ar
bg_dt = datetime.datetime.now()
#-------------------------------------------------------------------------------------
in_dz=r'C:\dataother.xlsx'
model_dz=r'C:\max_model - 测试20.xlsx'
out_dz=r'C:\datasetall classified 56w optim=0 测试20.xlsx'
optim=0
number_columns=[] #number_columns=['不含税价款(元)','税额(元)']
#-------------------------------------------------------------------------------------
df_in=dataread(in_dz,number_columns)
df_model=modelread(model_dz)
df_out=model_use(df_in,df_model,optim)
df_out.to_excel(out_dz,index=False)
diff=datetime.datetime.now()-bg_dt
diff_s=diff.days*24*60*60 + diff.seconds
print('Finished in ' + str(diff_s) + ' seconds')