-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathNatural_Language_Toolkit.py
More file actions
85 lines (70 loc) · 3.23 KB
/
Natural_Language_Toolkit.py
File metadata and controls
85 lines (70 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import nltk
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, RegexpTokenizer, TweetTokenizer, PunktSentenceTokenizer, WhitespaceTokenizer
from collections import Counter
# Sample text for tokenization: deliberately includes a repeated word ("He He"),
# quoted speech, a hashtag, a mention, and an emoji so the tokenizers' different
# handling of punctuation and social-media markup is visible in the output.
text = "He He said, 'Hello!' #greeting @someone 😊 How are you?"
def displayTokenizationResults(tokenizer_name, tokens):
    """
    Print a labelled, underlined section followed by one token per line.

    Args:
        tokenizer_name: Non-empty label naming the tokenizer that produced
            the tokens.
        tokens: Non-empty list of tokens to print.

    Raises:
        TypeError: If `tokenizer_name` is not a str or `tokens` is not a list.
        ValueError: If either argument is empty.
    """
    # Guard clauses: reject wrong types first, then empty values.
    if not isinstance(tokenizer_name, str):
        raise TypeError("`tokenizer_name` must be a string.")
    if not isinstance(tokens, list):
        raise TypeError("`tokens` must be a list.")
    if not tokenizer_name:
        raise ValueError("`tokenizer_name` cannot be empty.")
    if not tokens:
        raise ValueError("`tokens` cannot be empty.")
    # Header line plus a dashed underline the same width as the name.
    underline = "-" * len(tokenizer_name)
    print(f"\n{tokenizer_name}:\n{underline}")
    for item in tokens:
        print(item)
# Run the same sample text through six NLTK tokenizers and print each result.
# NOTE(review): every `*_tokens` name below is module-level state;
# `treebank_tokens` is reused later for the frequency chart, and the others may
# be consumed by code outside this view — keep the names stable.

# 1. TreebankWordTokenizer - splits words per the Penn Treebank conventions
#    (e.g. separates trailing punctuation, handles contractions).
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
displayTokenizationResults("TreebankWordTokenizer", treebank_tokens)
# 2. WordPunctTokenizer - splits on every run of alphabetic vs. non-alphabetic
#    characters, so punctuation always becomes separate tokens.
wordpunct_tokenizer = WordPunctTokenizer()
wordpunct_tokens = wordpunct_tokenizer.tokenize(text)
displayTokenizationResults("WordPunctTokenizer", wordpunct_tokens)
# 3. RegexpTokenizer - keeps only spans matching r'\w+', dropping punctuation,
#    the emoji, and the '#'/'@' prefixes entirely.
regexp_tokenizer = RegexpTokenizer(r'\w+')
regexp_tokens = regexp_tokenizer.tokenize(text)
displayTokenizationResults("RegexpTokenizer (only words)", regexp_tokens)
# 4. TweetTokenizer - social-media aware: keeps '#greeting', '@someone',
#    and the emoji as single tokens.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
displayTokenizationResults("TweetTokenizer", tweet_tokens)
# 5. PunktSentenceTokenizer - unsupervised sentence splitter; here it is used
#    untrained, so it falls back on its default parameters.
sent_tokenizer = PunktSentenceTokenizer()
sentences = sent_tokenizer.tokenize(text)
displayTokenizationResults("SentTokenizer (Punkt)", sentences)
# 6. WhitespaceTokenizer - splits purely on whitespace; punctuation stays
#    attached to the adjacent word.
whitespace_tokenizer = WhitespaceTokenizer()
whitespace_tokens = whitespace_tokenizer.tokenize(text)
displayTokenizationResults("WhitespaceTokenizer", whitespace_tokens)
def displayTokenFrequency(tokens):
    """
    Print the frequency of each token and, when possible, plot a bar chart.

    Args:
        tokens: Iterable of tokens to count (typically a list of str).

    Side effects:
        Prints a "Token Frequencies" section to stdout; if matplotlib and
        seaborn are importable, also displays a bar chart of the counts.

    Returns:
        None.
    """
    token_counts = Counter(tokens)
    print("\nToken Frequencies:")
    print("-------------------")
    for token, count in token_counts.items():
        print(f"{token}: {count}")
    # Nothing to plot for empty input; the zip(*...) unpack below would
    # otherwise raise ValueError on an empty Counter.
    if not token_counts:
        return
    # BUGFIX: matplotlib/seaborn were referenced as `plt`/`sns` but never
    # imported anywhere in the file, so this function raised NameError.
    # Import them lazily and degrade gracefully when they are unavailable.
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
    except ImportError:
        print("matplotlib/seaborn not installed; skipping frequency chart.")
        return
    labels, counts = zip(*token_counts.items())
    # Bar chart of token frequencies, x labels tilted for readability.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(labels), y=list(counts), palette="viridis")
    plt.xlabel('Tokens')
    plt.ylabel('Frequency')
    plt.title('Token Frequencies')
    plt.xticks(rotation=45)
    plt.show()
# Print (and, when plotting libraries are available, chart) the token
# frequencies for the TreebankWordTokenizer output computed above.
displayTokenFrequency(treebank_tokens)