首页 > 其他分享 >搜狗细胞词库处理代码(可用于scel转txt)

搜狗细胞词库处理代码(可用于scel转txt)

时间:2023-04-25 14:01:00浏览次数:37  
标签:搜狗 QByteArray int scel pos unsigned fData 词库 startPos



今天先贴个简单代码,稍后再详细叙述…… 

基于 Qt 实现,主要是考虑到 Unicode 字符处理比较方便;

代码稍加修改后,也可以用于 C 或 C++ 语言。

 

// 取连续两字节,转换为short类型的值,字节顺序是低字节-高字节
inline unsigned short GetUShort(QByteArray &fData, int startPos)
{
unsigned char low = fData.at(startPos);
unsigned char high = fData.at(startPos + 1);
unsigned short st = low + (high * 256);
 
return st;
}
 
// Converts a run of 16-bit Unicode values (low byte first) into a QString.
//
// fData    - buffer to read from
// startPos - byte offset of the first 16-bit value
// len      - maximum number of bytes to decode
// Returns the decoded text. Decoding stops at the first zero value, at len
// bytes, or at the end of the buffer, whichever comes first. The zero
// terminator itself is NOT included in the result, so fixed-width fields do
// not come back with an embedded U+0000 at the end.
inline QString GetStrvalue(QByteArray &fData, int startPos, int len)
{
    QString text;
    if (startPos < 0 || len <= 0)
    {
        return text;
    }

    // Clamp to what the buffer actually holds so a bad length cannot cause an
    // out-of-range read.
    const int available = fData.size() - startPos;
    const int usable = (len < available) ? len : available;

    // "i + 1 < usable" guarantees both bytes of each value are inside the
    // range, even when the length is odd.
    for (int i = 0; i + 1 < usable; i += 2)
    {
        const unsigned short unit = GetUShort(fData, startPos + i);
        if (unit == 0)
        {
            break;
        }
        text.append(QChar(unit));
    }

    return text;
}
 
//临时保存结果
class CHanzi
{
public:
void SetPy(QByteArray &py)
{
m_py = py;
}
 
void SetHz(QByteArray &hz)
{
m_hz = GetStrvalue(hz,0,hz.size());
}
 
void Debug()
{
QString py = "";
 
for (int i = 0 ; i < m_py.length() ; i += 2)
{
py += pyList.at(GetUShort(m_py,i));
}
 
qDebug() << m_hz << ":" << py;
}
 
private:
QByteArray m_py;
QString m_hz;
};
 
// 1. Reads the pinyin table.
//
// fData  - complete contents of the dictionary file
// pyList - receives the pinyin strings, appended in file order
//
// The table starts at offset 0x1540 with a 4-byte signature (9D 01 00 00),
// followed by records of the form:
//   16-bit index, 16-bit length in bytes, then the pinyin as 16-bit Unicode
//   values.
// Reading stops after the entry "zuo" (the last pinyin) or when the data
// runs out. If the signature does not match, an error is logged and pyList
// is left untouched.
inline void ReadPyTable(QByteArray &fData,QList<QString> &pyList)
{
    const int startPos = 0x1540;
    const int total = fData.length();

    // Compare the actual bytes. The previous check compared pointers against
    // a literal written with "/x" instead of hex escapes, so it never fired.
    const QByteArray fFlag = fData.mid(startPos, 4);
    if (fFlag != QByteArray::fromHex("9D010000"))
    {
        qDebug() << "读取词库拼音表失败!";
        return;
    }

    int pos = 4;
    // Each record needs at least its two 16-bit header fields.
    while (startPos + pos + 4 <= total)
    {
        // 16-bit index number identifying this pinyin
        const unsigned short n = GetUShort(fData, startPos + pos);
        pos += 2;

        // 16-bit pinyin length in bytes (twice the number of letters)
        const unsigned short len = GetUShort(fData, startPos + pos);
        pos += 2;

        // A length pointing past the end of the file means truncated data.
        if (startPos + pos + len > total)
        {
            break;
        }

        // Each letter occupies 16 bits, Unicode encoded
        const QString py = GetStrvalue(fData, startPos + pos, len);
        qDebug() << n << ":" << py;
        pyList.push_back(py);
        pos += len;

        // "zuo" is the last pinyin; the table is complete
        if (py == "zuo")
        {
            break;
        }
    }
}
 
// 2. Reads the word table.
//
// fData  - complete contents of the dictionary file
// hzList - receives one CHanzi per decoded record, appended in file order
//
// The table starts at offset 0x2628. Each record is laid out as:
//   16-bit offset weight, 16-bit pinyin length in bytes, the pinyin indexes
//   (16 bits each, indexes into the pinyin table), 16-bit word length in
//   bytes, the word as 16-bit Unicode values, then trailing data that is
//   skipped.
// Parsing stops at the first record that does not fit inside the buffer.
//
// NOTE(review): only the first word of each record is decoded; the bytes
// covered by the offset weight are skipped. Presumably these are further
// words sharing the same pinyin - confirm against the file format.
inline void ReadHzTable(QByteArray &fData,QList<CHanzi> &hzList)
{
    const int startPos = 0x2628;
    const int total = fData.length();
    int pos = 0;

    // Each record needs at least its two 16-bit header fields.
    while (startPos + pos + 4 <= total)
    {
        // Offset weight, 16 bit, used to compute the next record's position
        const unsigned short weight = GetUShort(fData, startPos + pos);
        pos += 2;

        // Pinyin length in bytes, 16 bit; each pinyin is one 16-bit value
        // whose value is an index into the pinyin table
        const unsigned short len1 = GetUShort(fData, startPos + pos);
        pos += 2;

        // A weight of 0 would wrap to 65535 after the "- 1" below and send
        // the skip far out of range; treat it as the end of usable data.
        if (weight == 0)
        {
            break;
        }

        // The pinyin bytes plus the following 16-bit word length must fit.
        if (startPos + pos + len1 + 2 > total)
        {
            break;
        }

        // Named locals (not temporaries) so they can be passed to setters
        // that take a reference.
        QByteArray pyBytes = fData.mid(startPos + pos, len1);
        pos += len1;

        // After the indexes comes the word length in bytes
        const unsigned short len2 = GetUShort(fData, startPos + pos);
        pos += 2;

        if (startPos + pos + len2 > total)
        {
            break;
        }

        // Unicode encoded, 16 bits per character
        QByteArray hzBytes = fData.mid(startPos + pos, len2);
        pos += len2;

        CHanzi hanzi;
        hanzi.SetPy(pyBytes);
        hanzi.SetHz(hzBytes);
        hzList.push_back(hanzi);

        // Distance to the next record. The 12 trailing bytes were not
        // analysed in detail by the original author (possibly word frequency
        // or similar). Computed in 64 bits because the product can exceed
        // the range of int.
        const long long skip =
            12LL + static_cast<long long>(weight - 1) * (12 + len2 + 2);
        if (startPos + pos + skip >= total)
        {
            break;
        }
        pos += static_cast<int>(skip);
    }
}
 
// 3. Reads the dictionary name, category, description and examples, and
// prints each of them with qDebug().
//
// fData - complete contents of the dictionary file
//
// The file must start with the 8-byte header 40 15 00 00 44 43 53 01 and be
// long enough to contain every field read below; otherwise a hint is logged
// and nothing else is printed.
// NOTE(review): files whose header differs in any of these 8 bytes are
// rejected; if other header variants exist they need to be added here.
inline void ReadFileInfo(QByteArray &fData)
{
    // Each field is read as at most 128 bytes of 16-bit Unicode values.
    const int fieldBytes = 128;
    const int nameOffset = 0x130;
    const int typeOffset = 0x338;
    const int descOffset = 0x540;
    const int sampleOffset = 0xd40;

    // Compare the actual header bytes. The previous check compared pointers
    // against a literal written with "/x" instead of hex escapes, so it
    // never fired.
    const QByteArray fFlag = fData.mid(0, 8);
    const bool headerOk = (fFlag == QByteArray::fromHex("4015000044435301"));
    const bool longEnough = (fData.size() >= sampleOffset + fieldBytes);
    if (!headerOk || !longEnough)
    {
        qDebug() << "确认你选择的是搜狗(.scel)词库?";
        return ;
    }

    QString strName;

    strName = GetStrvalue(fData, nameOffset, fieldBytes);
    qDebug() << "词库名:" << strName;

    strName = GetStrvalue(fData, typeOffset, fieldBytes);
    qDebug() << "词库类型:" << strName;

    strName = GetStrvalue(fData, descOffset, fieldBytes);
    qDebug() << "描述信息:" << strName;

    strName = GetStrvalue(fData, sampleOffset, fieldBytes);
    qDebug() << "词库示例:" << strName;
}

标签:搜狗,QByteArray,int,scel,pos,unsigned,fData,词库,startPos
From: https://blog.51cto.com/u_15408625/6223825

相关文章