1. Implementation of Text Preprocessing with NLTK
Tokenization, Stemming, Lemmatization, and Removal of Stopwords:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')  # required by word_tokenize in newer NLTK releases
nltk.download('stopwords')
nltk.download('wordnet')
# Sample text
text = "NLTK is a powerful library for text preprocessing in NLP tasks."
# Tokenization
tokens = word_tokenize(text)
# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
# Removing Stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word.lower() not in stop_words]
print("Tokens:", tokens)
print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)
print("Filtered Words:", filtered_words)
2. Convert Text to Word Count Vectors with Scikit-Learn
Using CountVectorizer:
from sklearn.feature_extraction.text import CountVectorizer
# Sample text data
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?'
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print("Feature Names:", vectorizer.get_feature_names_out())
print("Word Count Vectors:\n", X.toarray())
3. Convert Text to Word Frequency Vectors with Scikit-Learn
Using TfidfVectorizer:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?'
]
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
print("TF-IDF Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Vectors:\n", X_tfidf.toarray())
4. Convert Text to Unique Integers with Scikit-Learn
Using HashingVectorizer:
from sklearn.feature_extraction.text import HashingVectorizer
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?'
]
hashing_vectorizer = HashingVectorizer(n_features=10)
X_hash = hashing_vectorizer.fit_transform(corpus)
print("Hashing Vectors:\n", X_hash.toarray())
5. Using Keras to Split Words
Using text_to_word_sequence:
from keras.preprocessing.text import text_to_word_sequence
# Sample text
text = "Keras is a powerful deep learning library."
# Split words
words = text_to_word_sequence(text)
print("Words:", words)
6. Encoding with Keras one_hot
from keras.preprocessing.text import one_hot
# Sample text
text = "Keras is a great library for deep learning."
# Vocabulary size
vocab_size = 50
# One-hot encode
encoded_words = one_hot(text, vocab_size)
print("Encoded words:", encoded_words)
7. Hash Encoding with Keras hashing_trick
from keras.preprocessing.text import hashing_trick
# Sample text
text = "Keras is an amazing library for NLP."
# Hash Encoding
vocab_size = 50
hashed_words = hashing_trick(text, vocab_size, hash_function='md5')
print("Hashed words:", hashed_words)
8. Demo of the Keras Tokenizer API
from keras.preprocessing.text import Tokenizer
# Sample corpus
corpus = [
'Keras is a deep learning library.',
'It helps in NLP and other machine learning tasks.',
'We are learning text processing with Keras.'
]
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Word index
word_index = tokenizer.word_index
print("Word Index:", word_index)
# Convert to sequences
sequences = tokenizer.texts_to_sequences(corpus)
print("Text Sequences:", sequences)
9. Sentiment Analysis Experiment
You can perform sentiment analysis on a dataset such as Twitter tweets or movie reviews. Here's an example using the NLTK movie_reviews dataset:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
nltk.download('movie_reviews')
# Extract features
def extract_features(words):
    return {word: True for word in words}
# Prepare data
positive_reviews = [(extract_features(movie_reviews.words(fileid)), 'pos') for fileid in movie_reviews.fileids('pos')]
negative_reviews = [(extract_features(movie_reviews.words(fileid)), 'neg') for fileid in movie_reviews.fileids('neg')]
# Split dataset into training and testing
train_data = positive_reviews[:800] + negative_reviews[:800]
test_data = positive_reviews[800:] + negative_reviews[800:]
# Train Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_data)
# Evaluate
print("Accuracy:", accuracy(classifier, test_data))
# Show most informative features
classifier.show_most_informative_features(10)
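The trained classifier can then label new text. A small sketch (the review sentence is made up for illustration):
# Classify a new, made-up review with the trained model
sample_review = "An absolutely wonderful film with brilliant performances."
features = extract_features(sample_review.lower().split())
print("Predicted sentiment:", classifier.classify(features))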
10. Word2Vec Embedding with Gensim
from gensim.models import Word2Vec
# Sample sentences
sentences = [
['this', 'is', 'the', 'first', 'sentence'],
['this', 'is', 'another', 'sentence'],
['word2vec', 'is', 'cool']
]
# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# Check word vector
vector = model.wv['sentence']
print("Vector for 'sentence':", vector)
# Check most similar words
similar_words = model.wv.most_similar('sentence')
print("Most similar to 'sentence':", similar_words)