bdict格式是百度输入法的词库文件格式。那么怎么查看里面的内容呢?这就需要把bdict文件转换为txt文件。
比如下载“菜名大全” 文件名:dict_file_734_20111227170031_1.0.0.bdict
文件大小134924字节
转换代码参考这篇文档:【搜狗&百度词库】.bdict文件与.scel转txt_scel文件在线-CSDN博客
import struct
import binascii
class Baidu(object):
    """Extract Chinese words from a Baidu IME ``.bdict`` dictionary file.

    The conversion runs in two steps: :meth:`be2le` writes a copy of the
    input with the two bytes of every 16-bit unit swapped, and
    :meth:`le2txt` scans that copy for runs of CJK ideographs
    (U+4E00..U+9FFF) and writes one run per line to a text file.

    Attributes:
        originfile: Path of the input ``.bdict`` file.
        lefile: Path of the intermediate byte-swapped file
            (``originfile + '.le'``).
        txtfile: Path of the output text file; the last five characters of
            ``originfile`` (expected to be ``bdict``) are replaced by ``txt``.
        buf: Two-element scratch list, kept for backward compatibility.
        listwords: Characters of the word currently being collected.
    """

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[0:(len(originfile) - 5)] + 'txt'
        self.buf = [b'0' for x in range(0, 2)]
        self.listwords = []

    def be2le(self):
        """Write ``lefile`` as ``originfile`` with each byte pair swapped.

        An input of odd length is padded with a single zero byte so that
        every byte belongs to a complete pair.
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()

        # Pad to an even length with a real zero byte (not a copy of the
        # file's own data) so the final pair is well defined.
        if len(contents) % 2:
            contents += b'\x00'

        # Swap every pair at once with extended slices instead of looping
        # over the file two bytes at a time.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]

        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Write every run of CJK ideographs found in ``lefile`` to ``txtfile``.

        The data is read as big-endian 16-bit code units. Any code unit
        outside U+4E00..U+9FFF ends the current word; this includes lone
        surrogates, which therefore no longer abort the conversion. A word
        that is still open at the end of the data is written as well.
        """
        with open(self.lefile, 'rb') as lef:
            data = lef.read()

        # NOTE: the original offset 0x350 was applied to the hex string of
        # the data, i.e. it skips 0x350 hex digits == 0x1A8 bytes. The same
        # effective offset is kept here.
        body = data[0x350 // 2:]

        words = []
        # Stop before a dangling last byte that cannot form a code unit.
        for pos in range(0, len(body) - 1, 2):
            code_unit = (body[pos] << 8) | body[pos + 1]
            if 0x4E00 <= code_unit <= 0x9FFF:
                self.listwords.append(chr(code_unit))
            elif self.listwords:
                words.append(''.join(self.listwords))
                self.listwords = []

        # Flush the last word when the data ends inside a run of ideographs.
        if self.listwords:
            words.append(''.join(self.listwords))
            self.listwords = []

        # Explicit encoding so the output does not depend on the platform.
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            txtf.writelines(word + '\n' for word in words)
        print('写入txt成功')
if __name__ == '__main__':
    # Replace the placeholder below with the path of the .bdict file to
    # convert, then run both conversion steps in order.
    converter = Baidu('你的.bdict文件')
    converter.be2le()
    converter.le2txt()
下面的版本略作修改:提高了效率,降低了磁盘写操作的频率,并且可以在命令行执行时附带需要转换的文件名:
import struct
import binascii
import sys
class Baidu(object):
    """Extract Chinese words from a Baidu IME ``.bdict`` dictionary file.

    The conversion runs in two steps: :meth:`be2le` writes a copy of the
    input with the two bytes of every 16-bit unit swapped, and
    :meth:`le2txt` scans that copy for runs of CJK ideographs
    (U+4E00..U+9FFF) and writes one run per line to a text file.

    Attributes:
        originfile: Path of the input ``.bdict`` file.
        lefile: Path of the intermediate byte-swapped file
            (``originfile + '.le'``).
        txtfile: Path of the output text file; the last five characters of
            ``originfile`` (expected to be ``bdict``) are replaced by ``txt``.
        buf: Two-element scratch list, kept for backward compatibility.
        listwords: Characters of the word currently being collected.
    """

    def __init__(self, originfile):
        self.originfile = originfile
        self.lefile = originfile + '.le'
        self.txtfile = originfile[0:(len(originfile) - 5)] + 'txt'
        self.buf = [b'0' for x in range(0, 2)]
        self.listwords = []

    def be2le(self):
        """Write ``lefile`` as ``originfile`` with each byte pair swapped.

        An input of odd length is padded with a single zero byte so that
        every byte belongs to a complete pair.
        """
        with open(self.originfile, 'rb') as of:
            contents = of.read()

        # Pad to an even length with a real zero byte (not a copy of the
        # file's own data) so the final pair is well defined.
        if len(contents) % 2:
            contents += b'\x00'

        # Swap every pair at once with extended slices; growing a bytes
        # object pair by pair would copy the whole buffer on every step.
        swapped = bytearray(len(contents))
        swapped[0::2] = contents[1::2]
        swapped[1::2] = contents[0::2]

        with open(self.lefile, 'wb') as lef:
            lef.write(swapped)
        print('写入成功转为小端的字节流')

    def le2txt(self):
        """Write every run of CJK ideographs found in ``lefile`` to ``txtfile``.

        The data is read as big-endian 16-bit code units. Any code unit
        outside U+4E00..U+9FFF ends the current word; this includes lone
        surrogates, which therefore no longer abort the conversion. A word
        that is still open at the end of the data is written as well.
        """
        with open(self.lefile, 'rb') as lef:
            data = lef.read()

        # NOTE: the original offset 0x350 was applied to the hex string of
        # the data, i.e. it skips 0x350 hex digits == 0x1A8 bytes. The same
        # effective offset is kept here.
        body = data[0x350 // 2:]
        # Reported in hex digits (two per byte), as the original did.
        print(f'====len bytes of lefile:{len(body) * 2}')

        words = []
        # Stop before a dangling last byte that cannot form a code unit.
        for pos in range(0, len(body) - 1, 2):
            code_unit = (body[pos] << 8) | body[pos + 1]
            if 0x4E00 <= code_unit <= 0x9FFF:
                self.listwords.append(chr(code_unit))
            elif self.listwords:
                words.append(''.join(self.listwords))
                self.listwords = []

        # Flush the last word when the data ends inside a run of ideographs.
        if self.listwords:
            words.append(''.join(self.listwords))
            self.listwords = []

        # Single batched write; explicit encoding so the output does not
        # depend on the platform.
        with open(self.txtfile, 'w', encoding='utf-8') as txtf:
            txtf.writelines(word + '\n' for word in words)
        print('写入txt成功')
if __name__ == '__main__':
    # Use the first command-line argument as the input file and fall back to
    # the default name when none is given. An explicit length check replaces
    # the bare ``except:``, which also swallowed unrelated exceptions such as
    # KeyboardInterrupt.
    path = sys.argv[1] if len(sys.argv) > 1 else '8food.bdict'
    bd = Baidu(path)
    bd.be2le()
    bd.le2txt()
存盘为bd.py文件
执行转换
python bd.py
不带参数执行时,程序会自动读取8food.bdict文件,并转换为8food.txt文件
也可以跟文件名,比如下载的8大菜系文件是 dict_file_734_20111227170031_1.0.0.bdict:
python bd.py dict_file_734_20111227170031_1.0.0.bdict
生成文件为dict_file_734_20111227170031_1.0.0.txt,内容为菜名:
阿胶炖肉
阿胶牛肉汤
阿胶养阴粥
阿胶养阴粥
安阳三熏
鹌鹑蛋烧稚
鹌鹑枸杞粥
鹌鹑枸杞粥
熬黄花鱼
八宝菠菜
这样百度输入法词库就转换为txt文件了!
标签:bdict,lef,self,格式文件,originfile,txt,buf,contents,size From: https://blog.csdn.net/skywalk8163/article/details/142882183