Goals | Examples |
Understanding public opinion about a studied issue | 1. Title: "Restoring reputation through digital diplomacy…" Goal: identify which narratives appeared at different stages of a crisis Sample: k = 12,935 2. Title: "Analytical framework for evaluating digital diplomacy using network analysis and topic modeling: comparing South Korea and Japan" Goal: identify the consequences of violent protest tactics Sample: k = 9,659,770 |
Identifying dominant themes in discussion | 1. Title: "Analytical framework for evaluating digital diplomacy using network analysis and topic modeling: comparing South Korea and Japan" / "Аналитическая структура для оценки цифровой дипломатии с использованием сетевого анализа и тематического моделирования: сравнение Южной Кореи и Японии" Goal: identify fans'/tourists' attitudes toward Korea and … Sample: KOCIS posts (k = 16,962) and JNTO posts (k = 8,278) |
# Drop tokens that appear in fewer than 5 documents or in more than
# half of all documents before building the final corpus.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Creating bigrams
bigram_model = Phrases(data['cleaned_abstract'], min_count=5, threshold=10)
data['bigrams'] = data['cleaned_abstract'].apply(lambda tokens: bigram_model[tokens])

# Creating dictionary and corpus
dictionary = corpora.Dictionary(data['bigrams'])
corpus = [dictionary.doc2bow(text) for text in data['bigrams']]

# Calculating TF-IDF
tfidf = models.TfidfModel(corpus)
# NOTE(review): the stray text "limit=10, start=2, step=1" fused onto this
# line was a paste artifact that made the file a syntax error; removed.
corpus_tfidf = tfidf[corpus]
def compute_metrics(dictionary, corpus, texts, limit=10, start=2, step=1):
    """Train LDA models over a range of topic counts and score each one.

    For every candidate topic count in ``range(start, limit, step)`` an
    LdaModel is fitted (passes=150) and three metrics are recorded:
    c_v coherence, negated log-perplexity, and the likelihood bound.

    Returns a 5-tuple ``(models, coherences, perplexities, likelihoods,
    topic_counts)`` with one entry per candidate topic count.
    """
    trained_models = []
    coherences = []
    perplexities = []
    likelihoods = []
    topic_counts = []
    for k in range(start, limit, step):
        lda = models.LdaModel(corpus, num_topics=k, id2word=dictionary, passes=150)
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        trained_models.append(lda)
        coherences.append(scorer.get_coherence())
        perplexities.append(-lda.log_perplexity(corpus))
        likelihoods.append(lda.bound(corpus))
        topic_counts.append(k)
    return trained_models, coherences, perplexities, likelihoods, topic_counts
# Parameters for selecting the number of topics to scan.
start = 2
limit = 6  # maximum number of topics you want to test
step = 1

# Validate the sweep range up front, before any expensive model training.
# Checks run in order; the first failing one reports and stops the script.
for is_valid, message in (
    (limit >= start, "Error: limit must be greater than or equal to start"),
    (step > 0, "Error: step must be greater than 0"),
):
    if not is_valid:
        print(message)
        exit()
# Sweep the topic-count range and collect the metrics of each candidate model.
sweep = compute_metrics(dictionary, corpus_tfidf, data['bigrams'], limit, start, step)
model_list, coherence_values, perplexity_values, log_likelihood_values, num_topics_range = sweep

# Tabulating the per-model metrics side by side for inspection.
metrics_df = pd.DataFrame({
    'Num Topics': num_topics_range,
    'Coherence': coherence_values,
    'Perplexity': perplexity_values,
    'Log Likelihood': log_likelihood_values,
})
print(metrics_df)
# Displaying results: one panel per metric against the number of topics.
# NOTE(review): the original called plt.legend(("Coherence"), ...) -- the
# parentheses do not make a tuple, so matplotlib iterated the bare string
# and labeled the line "C". Labels are now attached via plot(label=...).
plt.figure(figsize=(12, 6))
metric_panels = [
    (coherence_values, "Coherence"),
    (perplexity_values, "Perplexity"),
    (log_likelihood_values, "Log Likelihood"),
]
for position, (values, label) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(num_topics_range, values, label=label)
    plt.xlabel("Number of topics")
    plt.ylabel(label)
    plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Outputting optimal numbers of topics according to all metrics.
# Coherence and log likelihood: higher is better. The stored perplexity
# values are negated log-perplexity (lower is better), so the optimum is
# the MINIMUM -- the original used max(), which picked the worst model.
optimal_num_topics_coherence = num_topics_range[coherence_values.index(max(coherence_values))]
optimal_num_topics_perplexity = num_topics_range[perplexity_values.index(min(perplexity_values))]
optimal_num_topics_log_likelihood = num_topics_range[log_likelihood_values.index(max(log_likelihood_values))]
print(f'Optimal number of topics by coherence: {optimal_num_topics_coherence}')
print(f'Optimal number of topics by perplexity: {optimal_num_topics_perplexity}')
print(f'Optimal number of topics by log likelihood: {optimal_num_topics_log_likelihood}')
# Word-tokenization example (NLTK).
# NOTE(review): this cell was collapsed onto one line whose leading "#"
# swallowed all of the code, turning it into a no-op comment; reconstructed
# as runnable multi-line code. The trailing "|" was extraction junk.
import nltk
from nltk.tokenize import word_tokenize

# Download data for tokenization (needed once per environment).
nltk.download('punkt_tab')

# Source text -- replace with your own.
raw = "Куда выйдешь, туда и попадешь"

# Word tokenization; specify 'russian' for Russian-language rules.
tokens = word_tokenize(raw, language='russian')
print("\nTokenized words:")
print(tokens)
# Punctuation-removal example (two equivalent approaches).
# NOTE(review): this cell was collapsed onto one line whose "#" swallowed
# everything after "import string"; reconstructed as runnable code. The
# broken spacing ("string. punctuation", "re. sub") is also fixed.
import string
import re

# Source text.
text = "Куда выйдешь, туда и попадешь"

# Removing punctuation using a generator expression over string.punctuation.
cleaned_text = "".join(l for l in text if l not in string.punctuation)
print(cleaned_text)

# Removing punctuation using a regular expression. Raw string avoids the
# invalid "\w" escape warning of the original pattern.
cleaned_text_regex = re.sub(r"[^\w\s]", "", text)
print(cleaned_text_regex)
# Lemmatization example (pymorphy2 and pymystem3).
# NOTE(review): this cell contained "!pip install" notebook magics (invalid
# in a .py file) and a missing newline before print(); reconstructed.
# Install the required libraries first:
#   pip install pymorphy2 pymystem3
from pymorphy2 import MorphAnalyzer
from pymystem3 import Mystem

# Create an analyzer instance for pymorphy2.
morph = MorphAnalyzer()
# Convert tokens to their normal form using pymorphy2.
# Assumes `tokens` comes from the earlier tokenization cell -- TODO confirm.
lemmas_pymorphy = [morph.parse(token)[0].normal_form for token in tokens]
print("Normalized forms (pymorphy2):", lemmas_pymorphy)

# Create an analyzer instance for Mystem.
mystem = Mystem()
# Convert tokens to their normal form using Mystem.
lemmas_mystem = [mystem.lemmatize(token)[0] for token in tokens]
print("Normalized forms (Mystem):", lemmas_mystem)