import pandas as pd
import re
# Collect every message from the "usermsg" column, with no duplicate values.
# The same CSV is read twice and concatenated, exactly as the original script
# did; the de-duplication below collapses the repeated rows again.
df1 = pd.read_csv("无标题.csv", encoding="gbk")
col1 = df1[["usermsg"]]
df2 = pd.read_csv("无标题.csv", encoding="gbk")
col2 = df2[["usermsg"]]
col = pd.concat([col1, col2])
# Missing cells are NaN (a float) and have no .strip(); drop them first, then
# coerce to str so a numeric-looking cell cannot break the strip either.
col = col["usermsg"].dropna().astype(str).str.strip()
# De-duplicate AFTER stripping so that values differing only in surrounding
# whitespace are recognised as the same message.
col = col.drop_duplicates()
# Successive filters over the messages in `col`; each pass keeps only the
# messages that survive and feeds the next pass.

# Drop messages that contain a phone number: an 11-digit run starting with 1
# that is not part of a longer digit run.
# Raw string avoids the invalid "\d" escape; the pattern is compiled once
# instead of once per message.
pattern2 = r"(?<!\d)(1\d{10})(?!\d)"
phone_re = re.compile(pattern2)
new_data = [i for i in col if not phone_re.search(i)]

# Drop messages that contain a web link (both http:// and https://).
url_schemes = ("http://", "https://")
new_data1 = [i for i in new_data if not any(s in i for s in url_schemes)]

# Drop messages made up only of digits.
new_data2 = [i for i in new_data1 if not i.isdigit()]

# Drop messages made up only of English letters.
# bytes.isalpha() only accepts ASCII letters, so encoding first keeps
# Chinese text (which str.isalpha() would also accept) from being dropped.
new_data3 = [i for i in new_data2 if not i.encode("UTF-8").isalpha()]

# Drop messages made up only of English letters and digits (same ASCII-only
# reasoning as above).
new_data4 = [i for i in new_data3 if not i.encode("UTF-8").isalnum()]

# Drop messages that are just a number such as "3.14".
# fullmatch is required here: with re.search and an optional fractional part
# the pattern matched any message containing a single digit, which silently
# discarded every message with a digit in it.
number_re = re.compile(r"\d+(\.\d+)?")
new_data5 = [i for i in new_data4 if not number_re.fullmatch(i)]
# Strip emoji and other pictographic symbols from a message.
# The character class is compiled once at import time rather than on every
# call. Python 3 strings are full Unicode, so the astral-plane ranges always
# compile and no surrogate-pair fallback is needed.
_EMOJI_RE = re.compile(
    "["
    "\U0001F300-\U0001F64F"  # pictographs and emoticons
    "\U0001F680-\U0001F6FF"  # transport and map symbols
    "\u2600-\u2B55"  # BMP symbols from U+2600 up to U+2B55
    "]+"
)


def clean(desstr, restr=""):
    """Return *desstr* with every run of emoji/symbol characters replaced.

    Args:
        desstr: The text to clean.
        restr: Replacement inserted once per consecutive run of matched
            characters (default: remove the run entirely). It is used as a
            regex replacement template, so backslash escapes are interpreted.

    Returns:
        The cleaned string; unchanged if it contains no matched characters.
    """
    return _EMOJI_RE.sub(restr, desstr)
# Strip emoji from every remaining message. A message that consisted only of
# emoji becomes empty (or whitespace) and is discarded here: dropna() below
# only removes NaN, and clean() always returns a string, so it would never
# filter such a message out.
new_data6 = [text for text in map(clean, new_data5) if text.strip()]
new_df = pd.DataFrame(data=new_data6)
new_df = new_df.dropna()
# Written with pandas defaults (index column included), as before.
new_df.to_csv("new_df.csv")
# Tags: usermsg, csv, phone number, untitled, pd, removal, text, col
# From: https://www.cnblogs.com/tiansz/p/17176589.html