Applying ML concepts to my own data

2024-02-08

So I just got out of class and want to fiddle around with the concepts I just learned: bag of words, term-document matrices, context vectors, and word embeddings.
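
As a quick refresher before using my own posts, here is a minimal toy sketch (the two sentences are made up) of what a bag of words and a term-document matrix look like: each row is a document, each column is a word, and each cell is a raw count.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Two made-up sentences, just to see what the counts look like
toy_docs = ["the cat sat on the mat", "the dog sat on the log"]

toy_vectorizer = CountVectorizer()
toy_counts = toy_vectorizer.fit_transform(toy_docs)

# Rows are documents, columns are words, cells are raw counts
print(pd.DataFrame(toy_counts.toarray(),
                   columns=toy_vectorizer.get_feature_names_out()))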

I trained a model using the text from here. Here is the code.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
d_Making_it_some_what_dynamic = """
Making it some what dynamic..."""
d_Making_it_properly_dynamic = """
Making it properly dynamic..."""
d_detection_script_debug = """
What's new detection script debug and fix..."""
d_Website_server_migration = """
Website server migration..."""
d_Creating_mdbook_summary_maker = """
Creating mdbook-summary-maker - a summary.md automation tool..."""
d_creating_this_website = r"""
Creating this website..."""
documents = [d_Creating_mdbook_summary_maker, d_creating_this_website, d_detection_script_debug, d_Making_it_properly_dynamic, d_Making_it_some_what_dynamic, d_Website_server_migration]
vrizer = CountVectorizer(stop_words="english")
vrizer.fit(documents)
X = vrizer.transform(documents)
print(X.shape)
dtm = pd.DataFrame(X.toarray(),
                   columns=vrizer.get_feature_names_out())
dtm
(6, 909)
00 02 05 07 10 12 16 17 200 2020 ... wraped write writing written www wwwroot xxxx year yes your_website_url
0 0 0 1 0 0 1 0 0 0 0 ... 0 3 2 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 2 0 0 0 0 0 0 0 0
2 1 3 0 3 2 0 1 1 0 0 ... 0 1 0 0 0 0 0 0 0 0
3 0 0 1 0 0 1 0 0 1 0 ... 1 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
5 0 0 0 1 0 1 0 0 0 1 ... 0 0 0 0 1 1 1 1 1 1
6 rows × 909 columns
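
Out of curiosity, the same matrix can be used to peek at what each post is "about" by pulling out the highest-count terms per row. A quick sketch, assuming the dtm DataFrame from above (row i corresponds to documents[i]):

# Top 5 terms by raw count for each post
for i, row in dtm.iterrows():
    top_terms = row.sort_values(ascending=False).head(5)
    print(f"document {i}: {list(top_terms.index)}")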

# Pairwise cosine similarity between the six document vectors
cosine_sim_matrix_for_antzed = cosine_similarity(dtm)
cosine_sim_matrix_for_antzed.shape
document_names = [
    "d_Creating_mdbook_summary_maker",
    "d_creating_this_website",
    "d_detection_script_debug",
    "d_Making_it_properly_dynamic",
    "d_Making_it_some_what_dynamic",
    "d_Website_server_migration"
]
# Initialize variables to store the maximum similarity and the document indices
max_similarity = 0
doc_index_1 = 0
doc_index_2 = 0

# Iterate over the matrix to find the pair with the highest similarity
for i in range(6):
    for j in range(i+1, 6):  # Start at i+1 to skip self-comparisons and duplicate pairs
        if cosine_sim_matrix_for_antzed[i, j] > max_similarity:
            max_similarity = cosine_sim_matrix_for_antzed[i, j]
            doc_index_1 = i
            doc_index_2 = j

# Print the document names with the highest similarity and their similarity score
print(f"The documents with the highest similarity are \"{document_names[doc_index_1]}\" and \"{document_names[doc_index_2]}\" with a similarity score of {max_similarity:.4f}.")
The documents with the highest similarity are "d_detection_script_debug" and "d_Making_it_some_what_dynamic" with a similarity score of 0.4253.
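
To see all the pairwise scores at once instead of just the maximum, the matrix can be wrapped in a labeled DataFrame. A small sketch, reusing cosine_sim_matrix_for_antzed and document_names from above:

import pandas as pd

# Label rows and columns with the post names so the matrix is readable
sim_df = pd.DataFrame(cosine_sim_matrix_for_antzed,
                      index=document_names,
                      columns=document_names)
print(sim_df.round(2))
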
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string

def get_wordnet_pos(word):
    """Map NLTK's part of speech tags to wordnet's"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def preprocess_text(text):
    tokens = simple_preprocess(text, deacc=True)  # Tokenize into words
    tokens_no_stops = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens_no_stops]
    return lemmatized_tokens

# Tokenize and preprocess all documents
processed_sentences = [preprocess_text(doc) for doc in documents]

# Flatten into a single list of words; the Word2Vec model below will see the whole corpus as one long "sentence"
flat_list_of_words = [word for sublist in processed_sentences for word in sublist]
print(flat_list_of_words[:5])
['create', 'mdbook', 'summary', 'maker', 'summary']
from gensim.models import Word2Vec

# Now that we have our sentences preprocessed and lemmatized, we train the model
model = Word2Vec([flat_list_of_words], vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use
model.save("lemmatized_word2vec_model.model")
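
Note that because I flattened everything, the model above saw the whole corpus as a single long "sentence". An alternative I might try later is keeping each post as its own token list, so the context window never crosses post boundaries. A sketch, assuming processed_sentences from above:

from gensim.models import Word2Vec

# One token list per blog post, so context windows stay inside a post
alt_model = Word2Vec(processed_sentences, vector_size=100, window=5,
                     min_count=1, workers=4)
print(alt_model.wv.most_similar('website', topn=5))
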
model.wv.most_similar('rust', topn=10)
[('ssh', 0.30321794748306274),
 ('target', 0.28839021921157837),
 ('enviroment', 0.28243470191955566),
 ('examine', 0.2718653678894043),
 ('succeful', 0.26447442173957825),
 ('touppercase', 0.2616710662841797),
 ('assist', 0.2575630247592926),
 ('true', 0.25681009888648987),
 ('familiar', 0.25594884157180786),
 ('doen', 0.2559070289134979)]
model.wv.most_similar('dynamic', topn=10)
[('top', 0.25962546467781067),
 ('variable', 0.2449072003364563),
 ('semi', 0.23804989457130432),
 ('later', 0.23783016204833984),
 ('could', 0.23667342960834503),
 ('lastindexof', 0.22947242856025696),
 ('indeed', 0.22467540204524994),
 ('easy', 0.22087810933589935),
 ('detection', 0.21729730069637299),
 ('directory', 0.21623274683952332)]
model.wv.most_similar('website', topn=10)
[('avaiable', 0.3144473731517792),
 ('fail', 0.3111109435558319),
 ('probably', 0.3086855113506317),
 ('wm', 0.306361585855484),
 ('host', 0.28776368498802185),
 ('save', 0.2712177634239197),
 ('backup', 0.26919832825660706),
 ('code', 0.2683839201927185),
 ('folder', 0.2653118073940277),
 ('etc', 0.2603006362915039)]
model.wv.most_similar('antzed', topn=10)
[('in_file_path', 0.3186494708061218),
 ('template', 0.30744123458862305),
 ('relationship', 0.29583120346069336),
 ('port', 0.295200377702713),
 ('filename', 0.2635174095630646),
 ('allows', 0.24324464797973633),
 ('preprocesor', 0.2392539530992508),
 ('item', 0.2337876856327057),
 ('treat', 0.22798201441764832),
 ('malfunction', 0.22339922189712524)]
model.wv.most_similar('problem', topn=10)
[('empty', 0.25634294748306274),
 ('us', 0.24955253303050995),
 ('template', 0.2491423785686493),
 ('junk', 0.24703580141067505),
 ('filter', 0.2332962602376938),
 ('bash', 0.23113256692886353),
 ('enter', 0.22585271298885345),
 ('sign', 0.2233218401670456),
 ('node', 0.21544311940670013),
 ('website', 0.21240629255771637)]

We can see that the detection script debugging post is most similar to the "making it somewhat dynamic" post, which was surprising.
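
One way to sanity-check that pair is to look at which terms the two posts actually share. A rough sketch, reusing dtm, doc_index_1, and doc_index_2 from above:

# Elementwise product is nonzero only where both posts use the term
shared = dtm.iloc[doc_index_1] * dtm.iloc[doc_index_2]
print(shared[shared > 0].sort_values(ascending=False).head(10))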

We can also see that some words, such as 'rust', are used alongside 'ssh'. That said, the similarity scores are all fairly low, so there doesn't seem to be much close context between words across my blog posts, probably because the corpus is so small.
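
To get a rough visual feel for how (un)structured these embeddings are, the vectors could be projected down to two dimensions and plotted (matplotlib is already imported at the top). A sketch, assuming the trained model from above:

from sklearn.decomposition import PCA

# Take the 50 most frequent words and their 100-dim vectors
words = model.wv.index_to_key[:50]
vectors = model.wv[words]

# Project to 2D and scatter-plot with word labels
coords = PCA(n_components=2).fit_transform(vectors)
plt.figure(figsize=(10, 8))
plt.scatter(coords[:, 0], coords[:, 1])
for word, (x, y) in zip(words, coords):
    plt.annotate(word, (x, y))
plt.title("Word2Vec vectors projected to 2D with PCA")
plt.show()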