首页 > 编程语言 >python实现海量分词

python实现海量分词

时间:2023-01-14 10:01:57浏览次数:62  
标签:python self addtbname db cursor 海量 sql hlseg 分词


其中,pyhlseg 的安装和使用请参考:
https://toscode.gitee.com/ryan70/pyhlseg#https://gitee.com/ryan70/segEvalTool

settings.py

# Name of the MySQL table that receives the segmentation results.
hlseg_table_name = '柘荣地名地址_hlseg'

# Path of the user dictionary handed to the segmenter.
user_dict_path = "HLseg/data/userdict.txt"

# Path of the input data file (read line by line by the writer script).
data_path = "HLseg/data/zrbzdz.txt"

# Name of the MySQL database to connect to.
dbname = "addresss"

create_hlseg_Mysql_1.py

import requests
from bs4 import BeautifulSoup
import re
import pymysql
import settings
class Mysqlhlseg(object):
    """Small MySQL helper used to store segmentation results.

    Every method opens its own connection through :meth:`conn` and closes it
    before returning, including when the statement raises.
    """

    # Database name, taken from the settings module.
    dbname = settings.dbname

    def conn(self):
        """Open and return a new pymysql connection to ``self.dbname``."""
        return pymysql.connect(host='10.0.0.13', user='lyf', password='lyf', db=self.dbname, port=3486, charset='utf8')

    def create(self, sql, addtbname):
        """Run a ``CREATE TABLE`` statement and report the outcome.

        Args:
            sql: The complete DDL statement to execute.
            addtbname: Table name, used only in the printed messages.

        Any failure is reported with the original "already exists" message;
        the underlying exception is printed as well so that other causes
        (bad SQL, missing privileges, ...) are not hidden.
        """
        db = self.conn()
        try:
            cursor = db.cursor()
            cursor.execute(sql)
            print("'{}'数据库表创建成功!!!".format(addtbname))
        except Exception as e:
            print("'{}'数据库表已存在!!!".format(addtbname))
            print(e)
        finally:
            # Close on both paths; the original leaked the connection on error.
            db.close()

    def insert_hlseg(self, hlsegone, addtbname):
        """Insert one row built from ``hlsegone`` into ``addtbname``.

        Args:
            hlsegone: Mapping of column name to value. An ``id`` key is
                skipped because the column is auto-incremented.
            addtbname: Target table name (interpolated into the SQL as-is).

        Failures are rolled back and reported on stdout; they are not raised.
        """
        columns = [k for k in hlsegone if k != 'id']
        placeholders = ['%({})s'.format(k) for k in columns]
        sql = 'insert into {}({}) value({})'.format(
            addtbname, ','.join(columns), ','.join(placeholders))

        db = self.conn()
        try:
            cursor = db.cursor()
            cursor.execute(sql, hlsegone)
            db.commit()
            print(addtbname,' 插入数据成功')
        except Exception as e:
            db.rollback()
            # Report the failure instead of swallowing it silently.
            print(addtbname, ' 插入数据失败:', e)
        finally:
            db.close()

    def update_hlseg(self, hlsegone, results, addtbname):
        """Update the existing row ``results`` with the values in ``hlsegone``.

        Args:
            hlsegone: Mapping of column name to new value; ``id`` is skipped.
            results: The existing row as returned by :meth:`query`.
                NOTE(review): for tuple rows the id is assumed to be the
                first column (true for ``SELECT *`` when ``id`` is declared
                first) -- confirm against the table definition.
            addtbname: Target table name (interpolated into the SQL as-is).

        Raises:
            Whatever pymysql raises; the connection is still closed.
        """
        assignments = ['{0}=%({0})s'.format(k) for k in hlsegone if k != 'id']
        if not assignments:
            # Nothing to set; an empty SET clause would be invalid SQL.
            return

        # The original hard-coded ``id=60`` and ignored ``results``.
        row_id = results['id'] if isinstance(results, dict) else results[0]
        sql = 'UPDATE {} SET {} WHERE id=%(_row_id)s'.format(
            addtbname, ','.join(assignments))
        params = dict(hlsegone)
        params['_row_id'] = row_id

        db = self.conn()
        try:
            cursor = db.cursor()
            cursor.execute(sql, params)
            db.commit()
        finally:
            db.close()

    def save_hlseg(self, hlsegone, addtbname):
        """Insert ``hlsegone`` if no row has its ``address_part``, else update.

        Args:
            hlsegone: Mapping of column name to value; must contain
                ``address_part``.
            addtbname: Target table name (interpolated into the SQL as-is).

        Exceptions are caught and printed, as in the original.
        """
        try:
            # Backticks quote the column name; the original single quotes
            # compared a string literal. The value is bound as a parameter.
            results = self.query(
                "SELECT * FROM {} WHERE `address_part`=%s".format(addtbname),
                (hlsegone['address_part'],))
            if not results:
                self.insert_hlseg(hlsegone, addtbname)
            else:
                self.update_hlseg(hlsegone, results[0], addtbname)
        except Exception as e:
            print(e)

    def query(self, sql, args=None):
        """Execute ``sql`` and return all fetched rows.

        Args:
            sql: The statement to run.
            args: Optional parameters passed to ``cursor.execute``; omit it
                to run ``sql`` exactly as given.

        Returns:
            The result of ``cursor.fetchall()``.
        """
        db = self.conn()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql, args)
                return cursor.fetchall()
            finally:
                cursor.close()
        finally:
            db.close()

    def delete_repeat(self, addtbname, fieldname):
        """Delete duplicate rows, keeping the smallest ``id`` per ``fieldname``.

        Args:
            addtbname: Table to de-duplicate.
            fieldname: Column whose equal values define a duplicate group.
        """
        sql = "delete from `{}` where id not in (select a.min_id from (select min(id) as min_id from `{}` group by `{}`)a)".format(addtbname, addtbname, fieldname)
        db = self.conn()
        try:
            cursor = db.cursor()
            try:
                cursor.execute(sql)
                # The original never committed, so the delete was discarded
                # when the connection closed.
                db.commit()
            finally:
                cursor.close()
        finally:
            db.close()
if __name__ == "__main__":
    # Running this module directly does nothing; everything below the
    # ``pass`` is disabled experiment code.
    # m=Mysqlhlseg()
    # m.create()

    pass
    # create()  # create the table

    # # Match the wanted data with re
    # pertern = re.compile(
    #     r'<img.*?data-original="(.*?)".*?<span class="search_now_price">(.*?)</span>.*?<a.*?单品作者.*?title="(.*?)">.*?</a>',
    #     re.S)
    # # Add request headers: override the User-Agent to imitate a browser
    # headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    # url = 'http://category.dangdang.com/cp01.19.34.00.00.00.html'
    # res = requests.get(url,headers=headers)
    # # print(res.status_code)
    # soup = BeautifulSoup(res.text, 'html.parser')
    # data = soup.find_all('ul', attrs={'class': 'bigimg'})
    # data = str(data)
    # item = re.findall(pertern, data)
    # for i in item:
    #     # print(i)
    #     insert(i)

write_hlseg_mysql_2.py

import sys,os,time
sys.path.append(os.path.abspath(os.path.dirname(__file__) + '/' + '..'))

# import sys,time,os
# sys.path.append(".../")
import sys,os,time
# 引用数据库
from create_hlseg_Mysql_1 import Mysqlhlseg
Mysqlhlseg=Mysqlhlseg()
from settings import *
# Load the Hylanda (hlseg) segmentation model

from pyhlseg import *
def load_hlseg():
    """Start the JVM, load the user dictionary and select large-grain output.

    The dictionary path comes from ``user_dict_path``. Disabled alternatives
    in the original were: loading ``HylandaSegment.BUILD_IN_USER_DICT``,
    calling ``load_dictionary()`` with no arguments, and passing
    ``multi_grain_size=True`` to ``set_option`` (which, per the original
    note, lets ``change_result_grain_size`` return other granularities).
    """
    segmenter = HylandaSegment
    segmenter.start_jvm()
    segmenter.load_dictionary(user_dict_path=user_dict_path)
    segmenter.set_option(grain_size=GrainSize.LARGE)

def unload_hlseg():
    """Shut the segmenter down by calling ``HylandaSegment.shutdown_jvm()``."""
    shutdown = HylandaSegment.shutdown_jvm
    shutdown()

if __name__ == "__main__":
    # Create the results table, segment every non-empty line of the data
    # file and store each (address, segmented text) pair as one row.
    # ``Mysqlhlseg`` here is the module-level instance, not the class.
    load_hlseg()
    try:
        t1 = time.time()

        sql = """CREATE TABLE {} (
id int(255) NOT NULL AUTO_INCREMENT,
address text COLLATE utf8_bin,
address_part text COLLATE utf8_bin,
PRIMARY KEY (id)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;""".format(hlseg_table_name)
        Mysqlhlseg.create(sql, hlseg_table_name)

        # utf-8-sig drops a leading BOM if the file has one.
        with open(data_path, "r", encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines would otherwise be stored as empty rows.
                    continue
                outs = HylandaSegment.segment(line)
                seg_text = str(outs.toString())
                # A fresh dict per row avoids carrying state between rows.
                address_dict = {"address": line, "address_part": seg_text}
                Mysqlhlseg.insert_hlseg(address_dict, hlseg_table_name)

        tt = time.time() - t1
        print('time ' + str(tt))
    finally:
        # Always shut the segmenter down, even if a step above failed.
        unload_hlseg()


标签:python,self,addtbname,db,cursor,海量,sql,hlseg,分词
From: https://blog.51cto.com/u_14736907/6007234

相关文章