# Prepare data
# Raw string: the previous literal mixed escaped ("\\") and unescaped ("\d")
# backslashes; "\d" is an invalid escape sequence that newer Python versions
# warn about. The resulting path value is unchanged.
PATH = r"C:\Users\mat\Desktop\data\Key phrase extraction of each stage\stage1_Key phrase.txt"

# Read the whole file and split it into lines. The context manager closes the
# handle; errors='ignore' skips bytes that cannot be decoded as UTF-8.
with open(PATH, encoding='utf-8', errors='ignore') as key_phrase_file:
    file_object2 = key_phrase_file.read().split('\n')

# One list of whitespace-separated tokens per input line (a blank line yields
# an empty list, exactly as before).
data_set = [line.split() for line in file_object2]

# Token <-> integer id mapping built from all documents.
dictionary = corpora.Dictionary(data_set)
# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in data_set]
# Calculate perplexity
def perplexity(num_topics):
    """Train an LDA model with ``num_topics`` topics and score it on ``corpus``.

    Args:
        num_topics: Number of topics to fit.

    Returns:
        The value of ``LdaModel.log_perplexity(corpus)``, evaluated on the
        same corpus the model was trained on. NOTE(review): gensim documents
        this as a per-word likelihood bound rather than the perplexity
        itself -- confirm which quantity callers expect.
    """
    ldamodel = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=30,
        # Fixed seed, matching coherence(), so repeated runs and different
        # topic counts are comparable instead of varying with the RNG.
        random_state=1,
    )
    return ldamodel.log_perplexity(corpus)
#Calculate coherence
def coherence(num_topics):
    """Fit a seeded LDA model and return its 'c_v' coherence score.

    Args:
        num_topics: Number of topics to fit.

    Returns:
        The result of ``CoherenceModel.get_coherence()`` for a model trained
        on ``corpus`` and evaluated against the tokenised texts in
        ``data_set``.
    """
    lda = LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=30,
        random_state=1,
    )
    scorer = CoherenceModel(
        model=lda,
        texts=data_set,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = scorer.get_coherence()
    return score
# Draw a line chart of topic coherence against the number of topics.
# The sweep runs under a __main__ guard because gensim's CoherenceModel can
# start worker processes; on platforms that spawn children by re-importing
# the main module (e.g. Windows), unguarded top-level work would be re-run
# in every worker.
if __name__ == "__main__":
    # Configure fonts before any text is drawn. 'font.sans-serif' is the
    # valid rc key; the previous '-serif' key is rejected with a KeyError.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Render the minus sign with an ASCII hyphen so it is available in SimHei.
    plt.rcParams['axes.unicode_minus'] = False

    x = range(1, 15)  # candidate topic counts 1..14
    # To plot perplexity instead: z = [perplexity(i) for i in x]
    y = [coherence(i) for i in x]

    plt.plot(x, y)
    plt.xlabel('Number of topics')
    plt.ylabel('coherence size')
    plt.title('Theme-coherence changes')
    plt.show()