Wednesday, October 16, 2024

Deep Learning with NLP Laboratory, B.Tech Sem 7 [203105477], Parul University



1. Implementation of Text Preprocessing with NLTK

Tokenization, Stemming, Lemmatization, and Removal of Stopwords:


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text
text = "NLTK is a powerful library for text preprocessing in NLP tasks."

# Tokenization
tokens = word_tokenize(text)

# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

# Removing Stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word.lower() not in stop_words]

print("Tokens:", tokens)
print("Stemmed Words:", stemmed_words)
print("Lemmatized Words:", lemmatized_words)
print("Filtered Words:", filtered_words)
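In practice these steps are usually chained into a single cleaning function. A minimal sketch, reusing the lemmatizer, stop_words, and text defined above (the helper name clean_text is just illustrative):

# Chain the steps above: tokenize, lowercase, drop stopwords and
# punctuation, then lemmatize the remaining tokens.
def clean_text(raw_text):
    tokens = word_tokenize(raw_text.lower())
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalpha() and word not in stop_words
    ]

print("Cleaned:", clean_text(text))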

2. Convert Text to Word Count Vectors with Scikit-Learn

Using CountVectorizer:


from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print("Feature Names:", vectorizer.get_feature_names_out())
print("Word Count Vectors:\n", X.toarray())
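Once fitted, the same vectorizer can encode new documents with transform(); words that were not seen during fitting are simply ignored. A short sketch (the new sentence is just an illustrative example):

# Encode a new document using the vocabulary learned above.
# Out-of-vocabulary words (e.g. "report") are dropped silently.
new_doc = ["This is a new report about the first document."]
X_new = vectorizer.transform(new_doc)
print("New document vector:", X_new.toarray())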

3. Convert Text to Word Frequency Vectors with Scikit-Learn

Using TfidfVectorizer:


from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

print("TF-IDF Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Vectors:\n", X_tfidf.toarray())
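TfidfVectorizer is equivalent to running CountVectorizer followed by TfidfTransformer. A small sketch of the two-step route, which is handy when the raw counts are also needed:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Two-step route: raw counts first, then TF-IDF weighting.
counts = CountVectorizer().fit_transform(corpus)
X_two_step = TfidfTransformer().fit_transform(counts)
print("Two-step TF-IDF vectors:\n", X_two_step.toarray())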

4. Convert Text to Unique Integers with Scikit-Learn

Using HashingVectorizer:


from sklearn.feature_extraction.text import HashingVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

hashing_vectorizer = HashingVectorizer(n_features=10)
X_hash = hashing_vectorizer.fit_transform(corpus)

print("Hashing Vectors:\n", X_hash.toarray())
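HashingVectorizer keeps no vocabulary, so it is stateless: transform() can be called directly without fitting, and with a small n_features different words may collide into the same column. A brief sketch reusing the vectorizer above:

# No fitting needed: the hash function maps words straight to columns.
X_direct = hashing_vectorizer.transform(['Is this the first document?'])
print("Directly transformed vector:\n", X_direct.toarray())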

5. Using Keras to Split Words

Using text_to_word_sequence:


from keras.preprocessing.text import text_to_word_sequence

# Sample text
text = "Keras is a powerful deep learning library."

# Split words
words = text_to_word_sequence(text)
print("Words:", words)
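By default text_to_word_sequence lowercases the text and strips punctuation; this can be adjusted through its lower, filters, and split arguments. A small sketch keeping the original casing:

# Keep the original casing by disabling the default lowercasing.
words_cased = text_to_word_sequence(text, lower=False)
print("Words (original casing):", words_cased)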

6. Encoding with Keras one_hot


from keras.preprocessing.text import one_hot

# Sample text
text = "Keras is a great library for deep learning."

# Vocabulary size
vocab_size = 50

# One-hot encode
encoded_words = one_hot(text, vocab_size)
print("Encoded words:", encoded_words)
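Despite its name, one_hot does not return one-hot vectors; it hashes each word to an integer index, so repeated words receive the same index and different words can collide. A quick check, reusing vocab_size from above (the sentence is just an example):

# Repeated words ("the") map to the same integer; collisions between
# different words are possible because the index is hash-based.
encoded_repeat = one_hot("the cat sat on the mat", vocab_size)
print("Encoded with repeats:", encoded_repeat)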

7. Hash Encoding with Keras hashing_trick


from keras.preprocessing.text import hashing_trick

# Sample text
text = "Keras is an amazing library for NLP."

# Hash Encoding
vocab_size = 50
hashed_words = hashing_trick(text, vocab_size, hash_function='md5')
print("Hashed words:", hashed_words)
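Passing hash_function='md5' makes the word-to-index mapping deterministic, whereas the default Python hash() is not guaranteed to be stable across interpreter runs. A tiny sketch reusing the variables above:

# Hashing the same text again with 'md5' yields identical indices.
hashed_again = hashing_trick(text, vocab_size, hash_function='md5')
print("Same indices on repeat:", hashed_words == hashed_again)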

8. Demo of Keras Tokenizer API


from keras.preprocessing.text import Tokenizer

# Sample corpus
corpus = [
    'Keras is a deep learning library.',
    'It helps in NLP and other machine learning tasks.',
    'We are learning text processing with Keras.'
]

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word index
word_index = tokenizer.word_index
print("Word Index:", word_index)

# Convert to sequences
sequences = tokenizer.texts_to_sequences(corpus)
print("Text Sequences:", sequences)
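The resulting integer sequences have different lengths; for most Keras models they need to be padded to a common length with pad_sequences. A minimal sketch continuing from the sequences above:

from keras.preprocessing.sequence import pad_sequences

# Pad every sequence to the length of the longest one, adding zeros at the end.
padded = pad_sequences(sequences, padding='post')
print("Padded sequences:\n", padded)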

9. Sentiment Analysis Experiment

You can perform sentiment analysis on a dataset such as Twitter tweets or movie reviews. Here's an example using the NLTK movie_reviews corpus:


import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

nltk.download('movie_reviews')

# Extract features
def extract_features(words):
    return {word: True for word in words}

# Prepare data
positive_reviews = [(extract_features(movie_reviews.words(fileid)), 'pos')
                    for fileid in movie_reviews.fileids('pos')]
negative_reviews = [(extract_features(movie_reviews.words(fileid)), 'neg')
                    for fileid in movie_reviews.fileids('neg')]

# Split dataset into training and testing
train_data = positive_reviews[:800] + negative_reviews[:800]
test_data = positive_reviews[800:] + negative_reviews[800:]

# Train Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_data)

# Evaluate
print("Accuracy:", accuracy(classifier, test_data))

# Show most informative features
classifier.show_most_informative_features(10)
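Once trained, the classifier can label new text by passing it through the same feature extractor. A small sketch (the review sentence is just an illustrative example):

# Classify an unseen review using the same bag-of-words features.
new_review = "An absolutely wonderful film with brilliant performances".split()
print("Predicted label:", classifier.classify(extract_features(new_review)))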

10. Word2Vec Embedding with Gensim


from gensim.models import Word2Vec

# Sample sentences
sentences = [
    ['this', 'is', 'the', 'first', 'sentence'],
    ['this', 'is', 'another', 'sentence'],
    ['word2vec', 'is', 'cool']
]

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Check word vector
vector = model.wv['sentence']
print("Vector for 'sentence':", vector)

# Check most similar words
similar_words = model.wv.most_similar('sentence')
print("Most similar to 'sentence':", similar_words)
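The trained model can also report the cosine similarity between any two in-vocabulary words and be saved for later reuse. A short sketch using Gensim's standard API (the filename is arbitrary):

# Cosine similarity between two words from the training vocabulary.
print("similarity('sentence', 'word2vec'):",
      model.wv.similarity('sentence', 'word2vec'))

# Persist the model and load it back later.
model.save("word2vec_demo.model")
loaded = Word2Vec.load("word2vec_demo.model")
print("Reloaded vector size:", loaded.wv.vector_size)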