方法:找到源码的变量进行修改
示例:使【Blade Master】这类中间有空格的词被识别
import jieba, re
# Demo: keep a user-defined word that contains a space ('Blade Master')
# in one piece when cutting with jieba.
sentence = 'Blade Master疾风刺杀Archmage'
jieba.add_word('Blade Master')  # register the word

# First cut: jieba's stock pattern is still in effect.
print(list(jieba.cut(sentence)))

# Patch the module-level pattern in jieba's source. This character class
# includes a space (before the trailing '-') so that a space can be part
# of a matched run instead of separating two runs.
jieba.re_han_default = re.compile(r'([\u4E00-\u9FD5a-zA-Z0-9+#&._% -]+)', re.U)

# Second cut: same sentence, now with the patched pattern.
print(list(jieba.cut(sentence)))
import jieba, jieba.posseg as jp, re
# Demo: part-of-speech tagging with a user word that contains a space.
sentence = 'Demon Hunter斩杀大法师'
jieba.add_word('Demon Hunter', 9, 'hero')  # word, frequency 9, custom tag 'hero'

# Patch the pattern used inside jieba.posseg so that any run of characters
# (spaces included) is handed over as a single chunk.
jp.re_han_internal = re.compile('(.+)', re.U)

print(jp.lcut(sentence))
import jieba, jieba.posseg as jp, re
# Demo: tagging a sentence in which the spaced word 'D H' appears several
# times, with and without the HMM.
sentence = 'D H的D H的DH'

# Patch the pattern used inside jieba.posseg so that any run of characters
# (spaces included) is handed over as a single chunk.
jp.re_han_internal = re.compile('(.+)', re.U)

# Register the words.
jieba.add_word('D H')
jieba.add_word('的', tag='DE')

# Print the tagged result, first with the default settings, then with the
# HMM switched off.
print(jp.lcut(sentence))
print(jp.lcut(sentence, HMM=False))
from jieba import dt
from jieba.posseg import POSTokenizer

# Demo: a POSTokenizer built on jieba's default tokenizer `dt`; words and
# tags are added on `dt` and the same text is re-tagged after each change.
text = '正义的洪基伟斩杀邪恶的巨法师'
pos_dt = POSTokenizer(dt)
print(pos_dt.lcut(text))

dt.add_word('巨法师', 1, 'DDD')  # new word with frequency 1 and custom tag 'DDD'
print(pos_dt.lcut(text))

dt.add_word('的', tag='DE')  # give '的' the custom tag 'DE'
print(pos_dt.lcut(text))
…'DDD')]
…'uj'), pair('洪基伟', 'nr'), pair('斩杀', 'v'), pair('邪恶', 'a'), pair('的', 'DE'), pair('巨法师', 'DDD')]
import jieba
# Demo: cut the same string before and after registering two short words
# ('360' and 'app') to show how added words change the segmentation.
s = 'apple均价93600'

before = ' '.join(jieba.cut(s))
print(before)

for extra_word in ('360', 'app'):
    jieba.add_word(extra_word)

after = ' '.join(jieba.cut(s))
print(after)
app le 均价 9 360 0
from os import path
import re
import jieba
from math import log
# Path of the dictionary file shipped inside the installed jieba package.
jieba_dict = path.join(path.dirname(jieba.__file__), 'dict.txt')


class Token:
    """Dictionary-based word segmenter using a DAG and a max-probability route.

    Candidate words come from the frequency dictionary plus two regular
    expressions (English-like tokens and numbers), so that a word added to
    the dictionary cannot break up a longer alphanumeric token.
    """

    # An ASCII letter followed by letters, digits, '_' or '-'.
    re_eng = re.compile('[a-zA-Z][a-zA-Z0-9_-]*')
    # A digit followed by digits or . + % / ~ -  (jieba tags numerals as 'm').
    re_m = re.compile('[0-9][0-9.+%/~-]*')

    def __init__(self, dt, total, max_len):
        """Store the dictionary and its statistics.

        Args:
            dt: mapping of word -> frequency.
            total: sum of all frequencies in ``dt``.
            max_len: longest candidate length (in characters) to consider.
        """
        self.dt = dt
        self.total = total
        self.max_len = max_len

    @classmethod
    def initialization(cls):
        """Build a Token from the dictionary file at ``jieba_dict``.

        Each line is split on whitespace; the first field is the word and
        the second its integer frequency. Lines with fewer than two fields
        are skipped.
        """
        dt = {}
        with open(jieba_dict, encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue
                dt[fields[0]] = int(fields[1])
        # Total frequency.
        total = sum(dt.values())
        # Maximum word length defaults to the longest dictionary word
        # (overly long English strings will therefore not be recognised).
        max_len = max(len(word) for word in dt)
        return cls(dt, total, max_len)

    def _get_DAG(self, sentence):
        """Return ``{start: [end, ...]}`` of candidate words in ``sentence``.

        ``end`` is inclusive, i.e. ``sentence[start:end + 1]`` is a
        candidate. Every single character is always a candidate.
        """
        length = len(sentence)
        dag = {}
        for head in range(length):
            tail = min(head + self.max_len, length)
            ends = [head]
            for middle in range(head + 2, tail + 1):
                word = sentence[head:middle]
                # ------------- dictionary + regex ------------- #
                if (word in self.dt
                        or self.re_eng.fullmatch(word)
                        or self.re_m.fullmatch(word)):
                    ends.append(middle - 1)
            dag[head] = ends
        return dag

    def _calculate(self, sentence):
        """Return ``{idx: (best_log_prob, end)}`` computed right to left.

        ``route[idx]`` holds the best score of segmenting ``sentence[idx:]``
        and the inclusive end index of the first word on that route.
        """
        dag = self._get_DAG(sentence)
        n = len(sentence)
        route = {n: (0, 0)}
        logtotal = log(self.total)
        for idx in range(n - 1, -1, -1):
            # Words missing from the dictionary (regex-only candidates and
            # single characters) count as frequency 1; ``or 1`` also keeps
            # a zero frequency from reaching log().
            route[idx] = max(
                (log(self.dt.get(sentence[idx:x + 1]) or 1)
                 - logtotal + route[x + 1][0], x)
                for x in dag[idx]
            )
        return route

    def cut(self, sentence):
        """Yield the words of ``sentence`` along the best route."""
        route = self._calculate(sentence)
        x = 0
        n = len(sentence)
        while x < n:
            y = route[x][1] + 1
            yield sentence[x:y]
            x = y

    def lcut(self, sentence):
        """Return ``cut(sentence)`` as a list."""
        return list(self.cut(sentence))

    def add_word(self, word, freq=1):
        """Set the frequency of ``word`` to ``freq`` and update the total."""
        original_freq = self.dt.get(word, 0)
        self.dt[word] = freq
        self.total = self.total - original_freq + freq

    def del_word(self, word):
        """Remove ``word`` from the dictionary if present and update the total."""
        original_freq = self.dt.get(word)
        if original_freq is not None:
            del self.dt[word]
            self.total -= original_freq


tokenizer = Token.initialization()
# Module-level shortcuts bound to the shared tokenizer instance.
cut = tokenizer.cut
lcut = tokenizer.lcut
add_word = tokenizer.add_word
del_word = tokenizer.del_word

if __name__ == '__main__':
    # Compare jieba with this module's tokenizer on the same string,
    # before and after adding the word '60r' to each.
    s = '小米60r价值3660rmb'
    print(' '.join(jieba.cut(s)))
    jieba.add_word('60r')
    print(' '.join(jieba.cut(s)))
    print('——')
    print(' '.join(cut(s)))
    add_word('60r')
    print(' '.join(cut(s)))
60r
mb 60
r
价值 3660 rmb
from jieba import dt
# Demo: inspect dt.FREQ before and after loading a user dictionary and
# after add_word calls.
print(dt.FREQ)

# NOTE(review): the file name in the original article was lost during
# extraction; 'userdict.txt' is a stand-in.
with open('userdict.txt', 'w', encoding='utf-8') as f:
    f.write('柳梦璃 99 nr')  # one entry: word, frequency, tag
dt.load_userdict('userdict.txt')

print('柳梦璃', dt.FREQ['柳梦璃'])
print('不', dt.FREQ['不'])

dt.add_word('不')  # no explicit frequency given
print('不', dt.FREQ['不'])

dt.add_word('不', 9)  # explicit frequency 9
print('不', dt.FREQ['不'])
dt.FREQ 在初始化前是空字典 {};初始化的时候会打印一大串红色文字,初始化完成后,dt.FREQ 才不为空。
本文发布于:2024-01-31 00:09:37,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170663098223825.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |