-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathModel_Comparison.py
More file actions
128 lines (100 loc) · 4.28 KB
/
Model_Comparison.py
File metadata and controls
128 lines (100 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim import corpora
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import TfidfModel
import nltk
import pyLDAvis.gensim_models
def Compute_Coherence_BERTopic():
df = pd.read_json(
"hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json", lines=True)
contexts = df['Context']
# Create BERTopic model
topic_model = BERTopic(top_n_words=25)
topics, probs = topic_model.fit_transform(contexts)
print(f'Number of topics = {len(set(topics))}')
# Visualize the topics
fig = topic_model.visualize_topics()
fig.write_html("bertopic_visualization.html")
# Prepare the documents for Gensim coherence calculation
# Split your documents into tokens
texts = [doc.split() for doc in contexts]
# Create a Gensim Dictionary and Corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Get topic words
topic_words = topic_model.get_topics()
topics = [[word for word, _ in topic_words[topic]]
for topic in topic_words]
coherence_model = CoherenceModel(
topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
# Get the coherence score
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)
def Compute_Coherence_LDA(from_n_topics, to_n_topics):
# Load pretrained Sentence-BERT model.
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
df = pd.read_json(
"hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json", lines=True)
contexts = df['Context'].tolist()
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
custom_stop_words = stop_words
# Preprocess questions
processed_questions = []
for context in contexts:
tokens = word_tokenize(context.lower())
base_words = [lemmatizer.lemmatize(token)
for token in tokens if token.isalpha()]
filtered_base_words = [
word for word in base_words if word not in custom_stop_words]
processed_questions.append(filtered_base_words)
# Create dictionary of processed questions
vocabulary = corpora.Dictionary(processed_questions)
# Represent each question as bag of words
corpus_bow = [vocabulary.doc2bow(question)
for question in processed_questions]
# Train the TF-IDF model
tfidf = TfidfModel(corpus_bow)
# Transform the BoW corpus to a TF-IDF corpus
corpus_tfidf = tfidf[corpus_bow]
# Model Tuning
best_model = None
best_coherence_score = -1
score_dict = {}
for n_topics in range(from_n_topics, to_n_topics):
lda_model = LdaModel(corpus_tfidf,
num_topics=n_topics,
id2word=vocabulary,
alpha=0.01,
eta=0.01,
passes=20,
random_state=42
)
perplexity_score = lda_model.log_perplexity(corpus_tfidf)
coherence_model_lda = CoherenceModel(
model=lda_model, texts=processed_questions, dictionary=vocabulary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
if (coherence_score > best_coherence_score):
best_coherene_score = coherence_score
best_model = lda_model
score_dict[n_topics] = [perplexity_score, coherence_score]
lda_model = best_model
print(score_dict)
# Visualize the topics
vis = pyLDAvis.gensim_models.prepare(
lda_model, corpus_tfidf, vocabulary)
pyLDAvis.save_html(vis, 'lda_visualization.html')
if __name__ == '__main__':
Compute_Coherence_BERTopic()
# Change the from and to range based on requirements.
# We already know the best number of topics are 7.
Compute_Coherence_LDA(from_n_topics=7, to_n_topics=8)