Named Entity Recognition
Like word segmentation and part-of-speech tagging, named entity recognition (NER) is a fundamental task in natural language processing. Its goal is to identify named entities in a corpus, such as person names, place names, and organization names.
Rule-based NER is usually done in one of two ways: the first is matching with regular expressions, and the second uses StanfordCoreNLP together with an NLTK chunk grammar.
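The regular-expression route is not shown in the code that follows, so here is a minimal sketch, assuming we only want to pull out dates and phone-number-like strings; the patterns and the extract_entities helper below are purely illustrative and not part of any library.
# -*- coding: utf-8 -*-
import re

# Illustrative patterns only: a date like 2023年5月1日 and an 11-digit mobile number
DATE_PATTERN = re.compile(r'\d{4}年\d{1,2}月\d{1,2}日')
PHONE_PATTERN = re.compile(r'1\d{10}')

def extract_entities(text):
    """Return a dict mapping entity type to the list of matched strings."""
    return {
        'date': DATE_PATTERN.findall(text),
        'phone': PHONE_PATTERN.findall(text),
    }

if __name__ == '__main__':
    print(extract_entities('会议定于2023年5月1日举行,联系电话13812345678。'))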
The StanfordCoreNLP approach:
ner.py: the main driver script; it reads the input text and writes the results
# -*- coding: utf-8 -*-
from grammer.rules import grammer_parse

fp = open('text.txt', 'r', encoding='utf8')
fout = open('out.txt', 'w', encoding='utf8')
# Iterate over the file directly (iterating over readlines() would also work)
# and parse every non-empty line, writing the results to out.txt
[grammer_parse(line.strip(), fout) for line in fp if len(line.strip()) > 0]
fp.close()
fout.close()
rules.py: defines the chunk grammar and the node-merging rules
# encoding=utf8
import nltk, json
from .tools import ner_stanford, cut_stanford

def get_stanford_ner_nodes(parent):
    # Walk the parsed tree and concatenate the tokens under each chunk label
    date = ''
    num = ''
    org = ''
    loc = ''
    for node in parent:
        if type(node) is nltk.Tree:
            if node.label() == 'DATE':
                date = date + " " + ''.join([i[0] for i in node])
            elif node.label() == 'NUMBER':
                num = num + " " + ''.join([i[0] for i in node])
            elif node.label() == 'ORGANIZATIONL':
                org = org + " " + ''.join([i[0] for i in node])
            elif node.label() == 'LOCATION':
                loc = loc + " " + ''.join([i[0] for i in node])
    if len(num) > 0 or len(date) > 0 or len(org) > 0 or len(loc) > 0:
        return {'date': date, 'num': num, 'org': org, 'loc': loc}
    else:
        return {}
def grammer_parse(raw_sentence=None, file_object=None):
    # assert grammer_type in set(['hanlp_keep','stanford_ner_drop','stanford_pos_drop'])
    # Skip sentences that are too short to be worth parsing
    if len(raw_sentence.strip()) < 5:
        return False
    # Define the chunk grammar. For example, <DATE>+ means that one or more
    # consecutive DATE-tagged tokens are merged into a single DATE chunk.
    grammer_dict = \
        {
            'stanford_ner_drop': r"""
            DATE:{<DATE>+<MISC>?<DATE>*<O>{2}}
                 {<DATE>+<MISC>?<DATE>*}
                 {<DATE>+}
                 {<TIME>+}
            ORGANIZATIONL:{<ORGANIZATION>+}
            LOCATION:{<LOCATION|STATE_OR_PROVINCE|CITY|COUNTRY>+}
            """
        }
    # Compile the grammar with NLTK's RegexpParser
    stanford_ner_drop_rp = nltk.RegexpParser(grammer_dict['stanford_ner_drop'])
    try:
        # ner_stanford(raw_sentence) returns the NER-tagged (token, tag) pairs;
        # the tag 'O' means the token does not belong to any entity type we care about.
        # The parse result is an nltk.Tree and can be visualized with its draw() method.
        stanford_ner_drop_result = stanford_ner_drop_rp.parse(ner_stanford(raw_sentence))
    except Exception:
        print("the error sentence is {}".format(raw_sentence))
    else:
        # Merge the chunked tree nodes into a flat dict according to our rules
        stanford_keep_drop_dict = get_stanford_ner_nodes(stanford_ner_drop_result)
        if len(stanford_keep_drop_dict) > 0:
            # Serialize the dict to JSON and write it to the output file
            file_object.write(json.dumps(stanford_keep_drop_dict,
                                         ensure_ascii=False,
                                         indent=4))
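As a quick sanity check of the chunk grammar and the node-merging step, the same rules can be run on a hand-written list of (token, tag) pairs without starting a Stanford CoreNLP server; the sample tokens below are made up for illustration.
# -*- coding: utf-8 -*-
import nltk

grammer = r"""
DATE:{<DATE>+<MISC>?<DATE>*<O>{2}}
     {<DATE>+<MISC>?<DATE>*}
     {<DATE>+}
     {<TIME>+}
ORGANIZATIONL:{<ORGANIZATION>+}
LOCATION:{<LOCATION|STATE_OR_PROVINCE|CITY|COUNTRY>+}
"""
rp = nltk.RegexpParser(grammer)

# Hand-written (token, tag) pairs in the shape stanford_nlp.ner() returns
tagged = [('2018', 'DATE'), ('年', 'DATE'), ('北京', 'CITY'), ('举行', 'O'), ('会议', 'O')]
tree = rp.parse(tagged)
print(tree)
# ('2018', '年') are merged into one DATE chunk and ('北京',) into a LOCATION chunk;
# feeding this tree to get_stanford_ner_nodes() would yield
# something like {'date': ' 2018年', 'num': '', 'org': '', 'loc': ' 北京'}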
tools.py: wrappers around HanLP and StanfordCoreNLP used by rules.py
# encoding=utf8
import os, gc, re, sys
from itertools import chain
from stanfordcorenlp import StanfordCoreNLP
import logging
from jpype import startJVM, getDefaultJVMPath, JClass

# Start the JVM and put the HanLP jar on the classpath (local paths on the author's machine)
startJVM(getDefaultJVMPath(), r"-Djava.class.path=E:\NLP\hanlp\hanlp-1.5.0.jar;E:\NLP\hanlp",
         "-Xms1g",
         "-Xmx1g")
NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
HanLP = JClass('com.hankcs.hanlp.HanLP')  # HanLP facade class, used by to_string_hanlp

# Start the Stanford CoreNLP client against a local model directory
stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05', lang='zh', quiet=False, logging_level=logging.DEBUG)
# stanford_nlp = StanfordCoreNLP(r'E:\NLP\stanford-corenlp-full-2018-10-05', lang='zh')

# Part-of-speech tags to drop when filtering segmentation results
drop_pos_set = set(['xu', 'xx', 'y', 'yg', 'wh', 'wky', 'wkz', 'wp', 'ws', 'wyy', 'wyz', 'wb', 'u', 'ud', 'ude1', 'ude2', 'ude3', 'udeng', 'udh'])
han_pattern = re.compile(r'[^\dA-Za-z\u3007\u4E00-\u9FCB\uE815-\uE864]+')

def to_string(sentence, return_generator=False):
    # Segment with HanLP's StandardTokenizer and split each "word/pos" item into (word, pos)
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in NLPTokenizer.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in NLPTokenizer.segment(sentence)]

def to_string_hanlp(sentence, return_generator=False):
    # Same as to_string, but segments through the HanLP facade class
    if return_generator:
        return (word_pos_item.toString().split('/') for word_pos_item in HanLP.segment(sentence))
    else:
        return [(word_pos_item.toString().split('/')[0], word_pos_item.toString().split('/')[1])
                for word_pos_item in HanLP.segment(sentence)]

def seg_sentences(sentence, with_filter=True, return_generator=False):
    # Segment a sentence and optionally drop words whose POS tag is in drop_pos_set
    segs = to_string(sentence, return_generator=return_generator)
    if with_filter:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ' and word_pos_pair[1] not in drop_pos_set]
    else:
        g = [word_pos_pair[0] for word_pos_pair in segs
             if len(word_pos_pair) == 2 and word_pos_pair[0] != ' ']
    return iter(g) if return_generator else g

def ner_stanford(raw_sentence, return_list=True):
    # Named entity recognition via Stanford CoreNLP: returns (token, tag) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.ner(raw_sentence) if return_list else iter(stanford_nlp.ner(raw_sentence))

def ner_hanlp(raw_sentence, return_list=True):
    # Segmentation with entity-aware tags via HanLP's StandardTokenizer
    if len(raw_sentence.strip()) > 0:
        return NLPTokenizer.segment(raw_sentence) if return_list else iter(NLPTokenizer.segment(raw_sentence))

def cut_stanford(raw_sentence, return_list=True):
    # Part-of-speech tagging via Stanford CoreNLP: returns (token, pos) pairs
    if len(raw_sentence.strip()) > 0:
        return stanford_nlp.pos_tag(raw_sentence) if return_list else iter(stanford_nlp.pos_tag(raw_sentence))
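Finally, a minimal usage sketch for the tools.py wrappers, assuming the package layout implied by the imports above (a grammer package containing rules.py and tools.py) and valid local paths for the CoreNLP model directory and the HanLP jar; the sample sentence is made up.
# -*- coding: utf-8 -*-
from grammer.tools import ner_stanford, seg_sentences, stanford_nlp

sentence = '2018年5月,小明在北京大学参加了会议。'

# (token, tag) pairs from Stanford CoreNLP, e.g. ('北京', 'CITY'), ('会议', 'O'), ...
print(ner_stanford(sentence))

# HanLP-based segmentation with function-word filtering
print(seg_sentences(sentence))

# Release the CoreNLP client when finished
stanford_nlp.close()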