Applying ML concepts to my own data
2024-02-08
So I just got out of class and want to fiddle around with the concepts I just learned: bag of words, the term-document matrix, context vectors, and word embeddings.
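Rough idea first: a bag-of-words model just counts how often each word appears in each document, and the term-document matrix stacks those counts into one row per document. Here's a tiny toy sketch of that idea before I touch my own posts (the two sentences and the `toy_*` names are made up for illustration):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Two made-up "documents"
toy_docs = ["the cat sat on the mat", "the dog sat on the log"]

toy_vectorizer = CountVectorizer()
toy_counts = toy_vectorizer.fit_transform(toy_docs)

print(toy_vectorizer.get_feature_names_out())  # the vocabulary: one column per word
print(toy_counts.toarray())                    # one row per document, word counts per column
```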
So I trained a model using the text from here. Here is the code.
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer

d_Making_it_some_what_dynamic = """
Making it some what dynamic..."""

d_Making_it_properly_dynamic = """
Making it properly dynamic..."""

d_detection_script_debug = """
What's new detection script debug and fix..."""

d_Website_server_migration = """
Website server migration..."""

d_Creating_mdbook_summary_maker = """
Creating mdbook-summary-maker - a summary.md automation tool..."""

d_creating_this_website = r"""
Creating this website..."""

documents = [d_Creating_mdbook_summary_maker, d_creating_this_website, d_detection_script_debug, d_Making_it_properly_dynamic, d_Making_it_some_what_dynamic, d_Website_server_migration]

# Build the bag-of-words term-document matrix
vrizer = CountVectorizer(stop_words="english")
vrizer.fit(documents)
X = vrizer.transform(documents)
print(X.shape)

dtm = pd.DataFrame(X.toarray(),
                   columns=vrizer.get_feature_names_out())
dtm
```
(6, 909)
| | 00 | 02 | 05 | 07 | 10 | 12 | 16 | 17 | 200 | 2020 | ... | wraped | write | writing | written | www | wwwroot | xxxx | year | yes | your_website_url |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 3 | 0 | 3 | 2 | 0 | 1 | 1 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 |
6 rows × 909 columns
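Each row of `dtm` is one post's bag-of-words vector. As a quick sanity check (a sketch I didn't actually run in the notebook), the most frequent terms in a single post can be pulled straight out of the DataFrame:

```python
# Top 10 terms in the first document (row 0 of the term-document matrix)
print(dtm.iloc[0].sort_values(ascending=False).head(10))
```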
```python
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim_matrix_for_antzed = cosine_similarity(dtm)
cosine_sim_matrix_for_antzed.shape

document_names = [
    "d_Creating_mdbook_summary_maker",
    "d_creating_this_website",
    "d_detection_script_debug",
    "d_Making_it_properly_dynamic",
    "d_Making_it_some_what_dynamic",
    "d_Website_server_migration"
]

# Initialize variables to store the maximum similarity and the document indices
max_similarity = 0
doc_index_1 = 0
doc_index_2 = 0

# Iterate over the matrix to find the pair with the highest similarity
for i in range(6):
    for j in range(i + 1, 6):  # Ensure no self-comparison
        if cosine_sim_matrix_for_antzed[i, j] > max_similarity:
            max_similarity = cosine_sim_matrix_for_antzed[i, j]
            doc_index_1 = i
            doc_index_2 = j

# Print the document names with the highest similarity and their similarity score
print(f"The documents with the highest similarity are \"{document_names[doc_index_1]}\" and \"{document_names[doc_index_2]}\" with a similarity score of {max_similarity:.4f}.")
The documents with the highest similarity are "d_detection_script_debug" and "d_Making_it_some_what_dynamic" with a similarity score of 0.4253.
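The same pair can also be read off without the explicit double loop. This is just a sketch of an alternative I could have used; `sim_df` and `upper` are names I'm making up here:

```python
# Label the 6x6 similarity matrix with the document names for easier reading
sim_df = pd.DataFrame(cosine_sim_matrix_for_antzed, index=document_names, columns=document_names)

# Keep only the upper triangle (k=1 drops the diagonal of self-similarities), then take the argmax
upper = np.triu(cosine_sim_matrix_for_antzed, k=1)
i, j = np.unravel_index(upper.argmax(), upper.shape)
print(document_names[i], document_names[j], upper[i, j])
```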
```python
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
```
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /home/jovyan/nltk_data...
[nltk_data] Package averaged_perceptron_tagger is already up-to-
[nltk_data] date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
```python
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string

def get_wordnet_pos(word):
    """Map NLTK's part of speech tags to wordnet's"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(text):
    tokens = simple_preprocess(text, deacc=True)  # Tokenize into words
    tokens_no_stops = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens_no_stops]
    return lemmatized_tokens
```
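A quick sanity check of the preprocessing on a made-up sentence (the sentence and the expected result are my own illustration, not output from the notebook):

```python
# Stop words like "the" and "were" should drop out, and the remaining tokens should be lemmatized
print(preprocess_text("The servers were running the detection scripts"))
# expected: something like ['server', 'run', 'detection', 'script']
```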
```python
# Tokenize and preprocess all documents
processed_sentences = [preprocess_text(doc) for doc in documents]

# Flatten the list if your model expects a list of sentences as input
flat_list_of_words = [word for sublist in processed_sentences for word in sublist]

print(flat_list_of_words[:5])
```
['create', 'mdbook', 'summary', 'maker', 'summary']
```python
from gensim.models import Word2Vec

# Now that we have our sentences preprocessed and lemmatized, we train the model
model = Word2Vec([flat_list_of_words], vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use
model.save("lemmatized_word2vec_model.model")
```
```python
model.wv.most_similar('rust', topn=10)
```
[('ssh', 0.30321794748306274),
('target', 0.28839021921157837),
('enviroment', 0.28243470191955566),
('examine', 0.2718653678894043),
('succeful', 0.26447442173957825),
('touppercase', 0.2616710662841797),
('assist', 0.2575630247592926),
('true', 0.25681009888648987),
('familiar', 0.25594884157180786),
('doen', 0.2559070289134979)]
```python
model.wv.most_similar('dynamic', topn=10)
```
[('top', 0.25962546467781067),
('variable', 0.2449072003364563),
('semi', 0.23804989457130432),
('later', 0.23783016204833984),
('could', 0.23667342960834503),
('lastindexof', 0.22947242856025696),
('indeed', 0.22467540204524994),
('easy', 0.22087810933589935),
('detection', 0.21729730069637299),
('directory', 0.21623274683952332)]
```python
model.wv.most_similar('website', topn=10)
```
[('avaiable', 0.3144473731517792),
('fail', 0.3111109435558319),
('probably', 0.3086855113506317),
('wm', 0.306361585855484),
('host', 0.28776368498802185),
('save', 0.2712177634239197),
('backup', 0.26919832825660706),
('code', 0.2683839201927185),
('folder', 0.2653118073940277),
('etc', 0.2603006362915039)]
```python
model.wv.most_similar('antzed', topn=10)
```
[('in_file_path', 0.3186494708061218),
('template', 0.30744123458862305),
('relationship', 0.29583120346069336),
('port', 0.295200377702713),
('filename', 0.2635174095630646),
('allows', 0.24324464797973633),
('preprocesor', 0.2392539530992508),
('item', 0.2337876856327057),
('treat', 0.22798201441764832),
('malfunction', 0.22339922189712524)]
```python
model.wv.most_similar('problem', topn=10)
```
[('empty', 0.25634294748306274),
('us', 0.24955253303050995),
('template', 0.2491423785686493),
('junk', 0.24703580141067505),
('filter', 0.2332962602376938),
('bash', 0.23113256692886353),
('enter', 0.22585271298885345),
('sign', 0.2233218401670456),
('node', 0.21544311940670013),
('website', 0.21240629255771637)]
We can see that the detection script debug post is most similar to the making-it-somewhat-dynamic post, which was surprising.
We can also see that some words, such as 'rust', are used alongside 'ssh'. That said, it seems like there isn't a lot of close context between the words in my blog posts.
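If I come back to this later, I can reload the saved model and poke at individual word pairs directly. A quick sketch (the score will depend on the training run):

```python
from gensim.models import Word2Vec

# Reload the model saved above and compare two specific words
loaded = Word2Vec.load("lemmatized_word2vec_model.model")
print(loaded.wv.similarity('rust', 'ssh'))
```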