Tokenization#

  • Splitting a piece of text into smaller units called tokens: individual words, groups of words, or sentences.

  • Optionally removing punctuation and special characters (note that nltk.word_tokenize keeps punctuation marks as separate tokens).

  • A technique for simplifying a corpus to prepare it for the next stage of processing.

[1]:
import nltk

# Sample passage that the sentence- and word-tokenizer cells below operate on.
text = "Hello! I am Nishant. I am an engineer, and I like to build things."

sentence tokenizer#

[2]:
# Split the passage into sentences: the result is a list holding one string
# per sentence. The bare name on the last line makes the notebook display it.
sentences = nltk.sent_tokenize(text)
sentences
[2]:
['Hello!', 'I am Nishant.', 'I am an engineer, and I like to build things.']

word tokenizer#

[3]:
# Split the passage into word-level tokens. Punctuation marks are returned as
# separate tokens rather than being dropped.
words = nltk.word_tokenize(text)

words
[3]:
['Hello',
 '!',
 'I',
 'am',
 'Nishant',
 '.',
 'I',
 'am',
 'an',
 'engineer',
 ',',
 'and',
 'I',
 'like',
 'to',
 'build',
 'things',
 '.']

Word Frequencies#

[4]:
# Count how often each token in `words` occurs (FreqDist is a Counter subclass).
wordFreq = nltk.FreqDist(words)
# dict(...) renders the complete token -> count mapping. Referencing
# `wordFreq.elements` without calling it only prints the bound-method repr,
# and calling it would yield an iterator rather than the counts.
print(f"""
    Word Frequencies : {dict(wordFreq)}
    2 Most Common    : {wordFreq.most_common(2)}
""")

    Word Frequencies : <bound method Counter.elements of FreqDist({'I': 3, 'am': 2, '.': 2, 'Hello': 1, '!': 1, 'Nishant': 1, 'an': 1, 'engineer': 1, ',': 1, 'and': 1, ...})>
    2 Most Common    : [('I', 3), ('am', 2)]

Importing Items of book#

[5]:
from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

find#

[6]:
# Text.findall prints the matching tokens itself and returns None, so it is
# called bare; wrapping it in print() adds a stray "None" line to the output.
# The pattern matches single tokens that start with "tri" and end with "r".
text1.findall("<tri.*r>")
triangular; triangular; triangular; triangular
None

word count#

[7]:
# Total length of text1: every token is counted each time it appears.
print(len(text1))
260819

unique word count#

[8]:
# Number of distinct tokens: set() collapses repeats. The comparison is
# case-sensitive, so differently-cased spellings count separately.
print(len(set(text1)))
19317

transforming words#

[9]:
# Distinct tokens after case-folding: lower-case every token, let the set
# comprehension collapse the duplicates, then count what remains.
print(len({token.lower() for token in text1}))
17231

average word frequency (tokens per distinct word)#

[10]:
# Ratio of total tokens to distinct tokens, i.e. how many times each distinct
# token occurs on average.
print(len(text1) / len(set(text1)))
13.502044830977896

filtering#

[12]:
# Distinct tokens that begin with "Sun". sorted() pins the order: iterating a
# set of strings can yield a different order in each kernel session, which
# would make this cell's displayed output unstable across re-runs.
sorted(word for word in set(text1) if word.startswith('Sun'))
[12]:
['Sunda', 'Sunday', 'Sunset']