import pandas as pd
from sqlalchemy import create_engine
# 从数据库中读取结果表数据到 DataFrame 中
engine = create_engine('mysql+pymysql://root:wwsa20030207@localhost/python_bigdata')
result_df = pd.read_sql('SELECT * FROM result_with_dimension', con=engine)
# 确定重复记录并进行清洗
duplicate_records = result_df[result_df.duplicated(subset=['地域', '成果名称'], keep=False)] # 找出重复记录
for idx, row in duplicate_records.iterrows():
# 保留一条记录
if idx == duplicate_records.index[0]:
continue # 跳过第一条记录,作为保留的记录
# 补充独有字段内容
for column in result_df.columns:
if pd.isna(result_df.loc[idx, column]) and not pd.isna(row[column]):
result_df.at[idx, column] = row[column]
# 删除其余记录
result_df.drop_duplicates(subset=['地域', '成果名称'], keep='first', inplace=True)
# 将处理后的数据写入 MySQL 数据库
result_df.to_sql('result_cleaned', con=engine, if_exists='replace', index=False)
# 关闭连接
engine.dispose()
标签:engine,idx,重复,column,df,result,pd,清洗,数据
From: https://www.cnblogs.com/lin513/p/18095621