python3 安装jieba:
pip3 install jieba
或者,先从以下地址下载源码包:
http://pypi.python.org/pypi/jieba/ ,解压后在解压出的目录中运行 python setup.py install
参考:https://github.com/fxsjy/jieba
实例:
获取英雄标签并创建自定义词典(mydict)
import requests
from pyquery import PyQuery as pq
from db import MongoClient
from config import MY_DICT
# Shared database client used by get_label() and make_mydict().
db = MongoClient()
def get_label(url, timeout=30):
    """Scrape the hero table at ``url`` and save one record per data row.

    Each table row that contains ``<td>`` cells is stored through
    ``db.save`` as a dict with the keys ``id`` (1-based running counter),
    ``hero_name`` (text of the 2nd cell), ``hero_name_list`` (texts of the
    2nd and 3rd cells) and ``join_time`` (text of the 6th cell).

    Args:
        url: Address of the page holding the hero table.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the script indefinitely.
    """
    r = requests.get(url, timeout=timeout)
    r.encoding = 'utf-8'  # set the page encoding through r.encoding
    doc = pq(r.text)
    rows = doc.find(
        'body > div.body-wrapper > div.content-wrapper > div > '
        'div.main-content > table:nth-child(154) > tr'
    ).items()

    # NOTE: the table on the page is faulty, so one record was corrected
    # separately, by hand, with this shell command:
    #
    #   db.getCollection('hero').update(
    #       // query
    #       { "id" : 2 },
    #       // update
    #       { '$set' : {'hero_name' : '幻翎',
    #                   'hero_name_list' : ['幻翎', '洛'],
    #                   "join_time" : "2017年4月18日"} },
    #       // options
    #       {
    #           "multi" : false,  // update only one document
    #           "upsert" : false  // insert a new document, if no existing document match the query
    #       }
    #   );

    hero_id = 0  # named hero_id so the builtin id() is not shadowed
    for tr in rows:
        # Rows without any <td> are skipped (this drops the <th> row).
        if not tr.find('td'):
            continue
        hero_id += 1
        hero_name = tr.find('td:nth-child(2)').text().strip()
        second_name = tr.find('td:nth-child(3)').text().strip()
        join_time = tr.find('td:nth-child(6)').text().strip()
        db.save({
            'id': hero_id,
            'hero_name': hero_name,
            'hero_name_list': [hero_name, second_name],
            'join_time': join_time,
        })
def make_mydict():
    """Write every name from ``db.get_hero_name_list()`` to ``MY_DICT``.

    The file is opened in write mode as UTF-8 and receives one name per
    line, each followed by a newline.
    """
    with open(MY_DICT, mode='w', encoding='utf-8') as dict_file:
        dict_file.writelines(
            str(hero_name) + '\n' for hero_name in db.get_hero_name_list()
        )
if __name__ == '__main__':
    # Scraping step, left disabled:
    # get_label('http://baike.baidu.com/item/英雄联盟/4615671#4')
    make_mydict()  # create the dictionary file
    print('ok...')
分词器
import jieba
import jieba.analyse
from db import MongoClient
from config import MY_DICT
class Tokenizer(object):
    """Segment room names with jieba and label rooms with matching hero names."""

    def __init__(self):
        self._db = MongoClient()
        # Load our own word list into jieba.
        jieba.load_userdict(MY_DICT)

    def get_hero_list(self):
        """Return the lines of ``MY_DICT`` as a list, each stripped of whitespace."""
        with open(MY_DICT, mode='r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def participle(self):
        """Segment every room name and store the hero names found in it.

        For each room from ``self._db.get_rooms()``, the ``r_name`` value is
        cut into words; the distinct words that also appear in the hero list
        are written to the room's ``r_label`` field via ``set_label``.
        The hero list and each room's words/labels are printed as progress
        output.
        """
        hero_list = self.get_hero_list()
        print('/'.join(hero_list))
        # Build the lookup set once: membership tests against the list would
        # rescan it for every word of every room.
        hero_names = set(hero_list)
        for room in self._db.get_rooms():
            # Segmentation (jieba's default precise mode).
            words = jieba.lcut(room['r_name'])
            labels = {w for w in words if w in hero_names}  # de-duplicated
            self._db.set_label(query={'r_id': room['r_id']},
                               data={'$set': {'r_label': list(labels)}})
            print(words, labels)
if __name__ == '__main__':
    # Build the tokenizer and label every room.
    Tokenizer().participle()
db
import pymongo
from config import *
class MongoClient(object):
    """Small wrapper around ``pymongo.MongoClient`` for this project's collections.

    The connection URL, database name and collection names come from the
    ``config`` module (``MONGO_URL``, ``MONGO_DB``, ``MONGO_TABLE``,
    ``MONGO_HERO_NAME``).
    """

    def __init__(self):
        self._client = pymongo.MongoClient(MONGO_URL)

    def get_rooms(self):
        """Yield ``{'r_id': ..., 'r_name': ...}`` for each ``MONGO_TABLE`` document."""
        # No limit is applied: every document is visited.
        for room in self._client[MONGO_DB][MONGO_TABLE].find():
            yield {
                'r_id': room['r_id'],
                'r_name': room['r_name'],
            }

    def set_label(self, **kwargs):
        """Apply an update to the first ``MONGO_TABLE`` document matching a query.

        Keyword Args:
            query: Filter selecting the document to update.
            data: Update document using update operators (e.g. ``$set``).

        Raises:
            KeyError: If ``query`` or ``data`` is not supplied.
        """
        # update_one replaces Collection.update, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4; like the old call with its
        # default multi=False, it touches at most one document.
        self._client[MONGO_DB][MONGO_TABLE].update_one(
            kwargs['query'], kwargs['data'], upsert=False)

    def save(self, msg):
        """Insert the document ``msg`` into ``MONGO_HERO_NAME``.

        Failures are printed rather than raised, so a bad record does not
        stop the caller's loop.
        """
        try:
            # insert_one replaces Collection.insert (removed in PyMongo 4).
            self._client[MONGO_DB][MONGO_HERO_NAME].insert_one(msg)
        except Exception as e:
            print("e: ", e)

    def get_hero_name_list(self):
        """Yield each entry of ``hero_name_list`` from every ``MONGO_HERO_NAME`` document."""
        for hero in self._client[MONGO_DB][MONGO_HERO_NAME].find():
            for name in hero['hero_name_list']:
                yield name
问题:
1. 运行的脚本文件名与 import 的包名相同
import jieba
jieba.cut("我来到北京清华大学")
AttributeError: 'module' object has no attribute 'cut'
不要把要运行的脚本命名为 jieba.py:否则 import jieba 导入的是脚本自身而不是 jieba 库,自然找不到 cut 属性。