1. 以 release 模式运行 Python
python -O process_file.py
可以在代码中加入以下代码,判断当前是否以 release 模式运行(带 -O 选项时 __debug__ 为 False,assert 语句也会被忽略):
# __debug__ is True in a normal run and False when Python is started with -O.
print("Debug mode" if __debug__ else "Release mode")
2.使用Cython
安装 Cython:
pip install cython
编写 .pyx 文件,即要编译的 Python 代码:
为了后面方便调用,你可以把需要运行的代码放到一个函数中,例如我放到了 main() 函数中
# process_file.pyx
# python -O process_file.py
import pandas as pd
from tqdm import tqdm
# Single-character substitutions applied by clean_str, built once at import
# time so the per-row call is a single pass over the string.
_CLEAN_STR_TABLE = str.maketrans({
    "\u3000": " ",  # full-width (ideographic) space -> ASCII space
    "\xa0": " ",    # non-breaking space (&nbsp;) -> ASCII space
    "【": "[",
    "】": "]",
})


def clean_str(input: str) -> str:
    """Return *input* stripped and with full-width punctuation normalized.

    Leading and trailing whitespace is removed first; then full-width spaces
    and non-breaking spaces become ASCII spaces, and the brackets ``【`` / ``】``
    become ``[`` / ``]``.

    Args:
        input: The text to clean. The name shadows the builtin ``input`` but
            is kept so existing keyword callers continue to work.

    Returns:
        The cleaned string.
    """
    return input.strip().translate(_CLEAN_STR_TABLE)
def main():
    """Clean the ownthink triple CSV chunk by chunk and write the result.

    Reads ``ownthink_v2/ownthink_v2.csv`` in chunks of 10000 rows. For every
    chunk it drops rows containing missing values, drops rows whose entity
    equals its value (e.g. "英雄联盟 中文名 英雄联盟"), runs ``clean_str`` over
    the three text columns, and appends the chunk to
    ``ownthink_v2/ownthink_v2_cleaned.csv``. The number of rows kept is
    printed at the end.
    """
    from pathlib import Path

    # Build paths with pathlib instead of "dir\file" literals: "\o" is an
    # invalid escape sequence in a normal string and the backslash form only
    # works on Windows.
    data_dir = Path("ownthink_v2")
    file_in = data_dir / "ownthink_v2.csv"
    file_out = data_dir / "ownthink_v2_cleaned.csv"
    chunk_size = 10000
    text_columns = ["实体", "属性", "值"]

    # Read the CSV lazily, one chunk at a time.  dtype=str keeps every field
    # as text, so clean_str never receives a number from a chunk whose column
    # happened to be inferred as numeric.
    data_all = pd.read_csv(file_in, chunksize=chunk_size, dtype=str)

    lc = 0  # number of rows written
    first_chunk = True
    # total=13996 is the expected chunk count (the original author noted
    # 139951300 input rows); it only affects the progress bar.
    for data_chunk in tqdm(data_all, total=13996):
        # Drop rows containing NaN (this also removes empty rows).
        data_chunk = data_chunk.dropna()

        # Filter out rows where the entity and the value are identical.
        # A boolean mask replaces the per-row DataFrame.drop, which copied
        # the whole chunk for every dropped row.
        keep = data_chunk["实体"] != data_chunk["值"]
        data_chunk = data_chunk[keep].copy()

        # Clean the columns on the frame itself.  Assigning to the row object
        # yielded by iterrows() is not guaranteed to write back to the frame,
        # so the cleaned text could be silently lost.
        for column in text_columns:
            data_chunk[column] = data_chunk[column].map(clean_str)

        lc += len(data_chunk)

        # The first chunk creates the file and writes the header; every later
        # chunk is appended without a header.
        data_chunk.to_csv(
            file_out,
            mode="w" if first_chunk else "a",
            index=False,
            header=first_chunk,
            encoding="utf-8",
        )
        first_chunk = False

    print(lc)
编写setup.py文件,使得 Cython 可以将我们的 Python 代码编译成 C 代码:
# setup.py
from setuptools import setup
from Cython.Build import cythonize
# Compile process_file.pyx into a C extension module.  language_level is
# pinned to 3 so the source is compiled with Python 3 semantics regardless of
# the default of the installed Cython version.
setup(
    ext_modules=cythonize(
        "process_file.pyx",
        compiler_directives={"language_level": "3"},
    ),
)
接着,运行命令:
python setup.py build_ext --inplace
这样会生成 build 文件夹、C 源文件(默认为 .c 文件)和编译好的扩展模块(Windows 下为 .pyd 文件,Linux/macOS 下为 .so 文件),其中 build 文件夹和 .pyd 文件是对你有用的;
你可以在Python代码中调用编译好的cython文件:
# Import the entry point from the compiled extension module
# (process_file.pyd / .so) rather than from the plain .py source.
from process_file import main

# Guard the call so importing this script does not start the whole job.
if __name__ == "__main__":
    main()
标签:Cython,运算,index,Python,chunk,replace,file,data,row
From: https://www.cnblogs.com/RakanLiu/p/18487177