刚开始接触自然语言处理,记下一些NLTK函数:
第一步:导入 NLTK 自带的示例文本
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
正文:
>>> fdist=FreqDist(text1)
>>> fdist['the']
13721
在text1中the出现的次数
>>> fdist.freq('the')
0.052607363727335814
////////////////源代码///////////////////////////
def freq(self, sample):
    """
    Return the frequency of a given sample.

    The frequency of a sample is defined as the count of that sample
    divided by the total number of sample outcomes that have been
    recorded by this FreqDist.  The count of a sample is defined as the
    number of times that sample outcome was recorded by this FreqDist.
    Frequencies are always real numbers in the range [0, 1].

    :param sample: the sample whose frequency should be returned.
    :type sample: any
    :rtype: float
    """
    # Compute the total once instead of summing all counts twice.
    total = self.N()
    # An empty distribution has no outcomes; avoid dividing by zero.
    if total == 0:
        return 0
    # NOTE: relies on true division (Python 3, or
    # ``from __future__ import division`` on Python 2).
    return self[sample] / total
the 的频率
>>> fdist.N()
260819
////////////////源代码///////////////////////////
def N(self):
    """
    Return the total number of sample outcomes that have been
    recorded by this FreqDist.

    For the number of unique sample values (or bins) with counts
    greater than zero, use ``FreqDist.B()``.

    :rtype: int
    """
    # The distribution maps each sample to its count, so the total
    # number of outcomes is the sum of all stored counts.
    return sum(self.values())
返回已记录的样本总数(即词例总数);如果要得到计数大于 0 的不同样本值的个数,请用下一个函数 B()
>>> fdist.B()
19317
返回不同样本值(bins)的个数,即 text1 中互不相同的词的数量
>>> fdist.hapaxes()
[u'funereal', u'unscientific', u'prefix', u'plaudits', u'woody', u'disobeying', u'Westers', u'DRYDEN', u'Untried', u'superficially', u'vesper', u'Western', u'Spurn', u'treasuries',........]
//代码
def hapaxes(self):
    """
    Return a list of all samples that occur once (hapax legomena).

    :rtype: list
    """
    # Iterating over the distribution yields its samples; keep those
    # whose recorded count is exactly one.
    return [item for item in self if self[item] == 1]

# Sanity check (see the interpreter session that follows): for a hapax,
# freq(sample) * N() should come out as 1.
//funereal的频率*样本总数
>>> fdist.freq('funereal')*fdist.N()
1.0
hapaxes() 返回只出现过一次的样本(hapax legomena)的列表
>>> fdist.max()
u','
//代码
def max(self):
    """
    Return the sample with the greatest number of outcomes in this
    frequency distribution.

    If two or more samples have the same number of outcomes, return
    one of them; which sample is returned is undefined.

    :return: The sample with the maximum number of outcomes in this
        frequency distribution.
    :rtype: any
    :raises ValueError: if no outcomes have been recorded in this
        frequency distribution.
    """
    if len(self) == 0:
        raise ValueError('A FreqDist must have at least one sample before max is defined.')
    # most_common(1) yields a one-element list of (sample, count) pairs;
    # take the sample of that single pair.
    return self.most_common(1)[0][0]
频率分布最高的样本
>>> fdist.pformat(maxlen=10)
u"FreqDist({u',': 18713, u'the': 13721, u'.': 6862, u'of': 6536, u'and': 6024, u'a': 4569, u'to': 4542, u';': 4072, u'in': 3916, u'that': 2982, ...})"
//代码(注:此处贴的仍是 max 的源码,并不是 pformat 的源码)
def max(self):
    """
    Return the sample with the greatest number of outcomes in this
    frequency distribution. If two or more samples have the same
    number of outcomes, return one of them; which sample is
    returned is undefined. If no outcomes have occurred in this
    frequency distribution, return None.

    :return: The sample with the maximum number of outcomes in this
        frequency distribution.
    :rtype: any or None
    """
    if len(self) == 0:
        raise ValueError('A FreqDist must have at least one sample before max is defined.')
    return self.most_common(1)[0][0]
按照频率由高到低输出前(参数)个
>>> fdist.keys()
[u'funereal', u'unscientific', u'divinely', u'foul', u'four', u'gag', u'prefix', u'woods'......]
返回所有样本的列表。注意:从上面的输出可以看到,这里的 keys() 并没有按频率递减排序;要得到按频率由高到低排列的结果,请用后面的 most_common()
>>> fdist.plot(90,cumulative=True)
绘制频率分布图(频率由高到低)
第一个参数—–横轴的点的数量
第二个参数—–是否累加(默认false)
>>> fdist.pprint(30)
FreqDist({u',': 18713, u'the': 13721, u'.': 6862, u'of': 6536, u'and': 6024, u'a': 4569, u'to': 4542, u';': 4072, u'in': 3916, u'that': 2982, u"'": 2684, u'-': 2552, u'his': 2459, u'it': 2209, u'I': 2124, u's': 1739, u'is': 1695, u'he': 1661, u'with': 1659, u'was': 1632, u'as': 1620, u'"': 1478, u'all': 1462, u'for': 1414, u'this': 1280, u'!': 1269, u'at': 1231, u'by': 1137, u'but': 1113, u'not': 1103, ...})
//代码
def pprint(self, maxlen=10, stream=None):
    """
    Print a string representation of this FreqDist to 'stream'.

    :param maxlen: The maximum number of items to print
    :type maxlen: int
    :param stream: The stream to print to. stdout by default
    """
    # Formatting is delegated to pformat(); ``file=None`` makes print()
    # fall back to standard output.
    print(self.pformat(maxlen=maxlen), file=stream)
流输出前(参数)个样本
>>> fdist.r_Nr()
defaultdict(<type 'int'>, {68: 9, 1: 9002, 2: 3193, 3: 1721, 4: 968, 5: 695, 6: 497, 7: 384, 8: 318, 9: 253, 10: 196, 11: 190, 12: 152, 13: 112, 14: 100, 15: 107, 16: 98, 17: 71, 18: 69, 19: 58, 20: 50, 21: 35, 22: 37, 23: 43, 24: 38, 25: 38, 26: 34, 27: 25, 28: 32, 29: 26, 30: 27, 31: 20, 32: 21, 33: 16, ......})
//代码
def r_Nr(self, bins=None):
    """
    Return the dictionary mapping r to Nr, the number of samples with
    frequency r, where Nr > 0.

    :type bins: int
    :param bins: The number of possible sample outcomes.  ``bins``
        is used to calculate Nr(0).  In particular, Nr(0) is
        ``bins-self.B()``.  If ``bins`` is not specified, Nr(0) is
        set to 0.
    :rtype: dict (a ``defaultdict(int)`` keyed by r)
    """
    _r_Nr = defaultdict(int)
    # Count how many distinct samples share each occurrence count.
    for count in self.values():
        _r_Nr[count] += 1

    # Special case for Nr[0]: samples that could have occurred but
    # were never seen.
    _r_Nr[0] = bins - self.B() if bins is not None else 0

    return _r_Nr
不是很了解,求教。(按上面源码的说明:r_Nr 返回一个字典,键 r 是出现次数,值 Nr 是恰好出现 r 次的不同样本的个数;例如 1: 9002 表示有 9002 个词只出现过一次。)
>>> fdist.tabulate(10,cumulative=True)
, the . of and a to ; in that
18713 32434 39296 45832 51856 56425 60967 65039 68955 71937
>>> fdist.tabulate(10)
, the . of and a to ; in that
18713 13721 6862 6536 6024 4569 4542 4072 3916 2982
//代码
def tabulate(self, *args, **kwargs):
    """
    Tabulate the given samples from the frequency distribution
    (cumulative), displaying the most frequent sample first.  If an
    integer parameter is supplied, stop after this many samples have
    been tabulated.

    :param samples: The samples to tabulate (default is all samples)
    :type samples: list
    :param cumulative: A flag to specify whether the freqs are
        cumulative (default = False)
    :type cumulative: bool
    """
    # With no positional argument, tabulate every sample.
    if len(args) == 0:
        args = [len(self)]
    samples = [item for item, _ in self.most_common(*args)]

    cumulative = _get_kwarg(kwargs, 'cumulative', False)
    if cumulative:
        freqs = list(self._cumulative_frequencies(samples))
    else:
        freqs = [self[sample] for sample in samples]
    # percents = [f * 100 for f in freqs]  only in ProbDist?

    # Use one column width wide enough for the longest sample label
    # and the longest count, so both rows line up.
    width = max(len("%s" % s) for s in samples)
    width = max(width, max(len("%d" % f) for f in freqs))

    # First row: the sample labels.
    for i in range(len(samples)):
        print("%*s" % (width, samples[i]), end=' ')
    print()
    # Second row: the matching counts.
    for i in range(len(samples)):
        print("%*d" % (width, freqs[i]), end=' ')
    print()
制表
第一个参数—–个数
第二个参数—–是否累加
>>> fdist.most_common()
[(u',', 18713), (u'the', 13721), (u'.', 6862), (u'of', 6536), (u'and', 6024), (u'a', 4569), (u'to', 4542), (u';', 4072), (u'in', 3916), (u'that', 2982), (u"'", 2684), (u'-', 2552), (u'his', 2459),.....]
返回样本中各个词的个数,从高到低排列
>>> sent=['i','m','aa','bb','cc']
>>> import nltk
>>> nltk.bigrams(sent)
>>> for i in nltk.bigrams(sent):
...     print i
...
('i', 'm')
('m', 'aa')
('aa', 'bb')
('bb', 'cc')
>>>
把 sent 中相邻的两个词依次组成二元组(bigram)
本文发布于:2024-01-31 03:21:37,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170664250224999.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |