import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
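# Content-based article recommendation:
# encode articles and users as concatenated one-hot feature vectors,
# then rank the articles for each user by cosine similarity.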
# Step 1: Feature selection
# Load the data files
user_df = pd.read_csv("data/user information.csv")
article_df = pd.read_csv("data/Article Information.csv")
# print(article_df)
cols_df = pd.read_csv("data/column information.csv")
# print(cols_df)
area_df = pd.read_csv("data/regional information.csv")
# print(area_df)
publish_df = pd.read_csv("data/release time information.csv")
# print(publish_df)
click_df = pd.read_csv("data/click information.csv")
# print(click_df)
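# The five CSV files provide the user profiles, the article metadata, and the
# value ranges of the column, region, release time rating, and click rating features.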
# Step 2: Feature preprocessing (one-hot encoding)
cols_vec = [0 for _ in cols_df["Column Number"]]
# The list comprehension above is equivalent to:
# cols_vec = []
# for _ in cols_df["Column Number"]:
#     cols_vec.append(0)
area_vec = [0 for _ in area_df["Geographical Number"]]
publish_vec = [0 for _ in publish_df["Publishing time number"]]
click_vec = [0 for _ in click_df["Click rating number"]]
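# Each *_vec above is a zero-filled template whose length equals the number of
# possible values for that feature, e.g. with 5 columns cols_vec == [0, 0, 0, 0, 0].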
# Traverse each article and encode its features
wenz_list = []
for i in range(len(article_df)):
    # One-hot encode the column numbers
    cols_str = str(article_df["Column Number"][i])
    cols_onehot_vec = cols_vec.copy()
    if cols_str == "No limit":
        # "No limit" means the article belongs to every column
        cols_onehot_vec = [1 for _ in cols_vec]
    else:
        # Handle multiple column numbers separated by commas
        cols_arr = cols_str.split(",")
        for j in cols_arr:
            j = int(j)
            cols_onehot_vec[j - 1] = 1
    # One-hot encode the region numbers
    area_str = str(article_df["Geographical Number"][i])
    area_onehot_vec = area_vec.copy()
    if area_str == "No limit":
        area_onehot_vec = [1 for _ in area_vec]
    else:
        # Handle multiple region numbers separated by commas
        area_arr = area_str.split(",")
        for j in area_arr:
            j = int(j)
            area_onehot_vec[j - 1] = 1
    # One-hot encode the release time rating
    publish_str = str(article_df["Release time rating"][i])
    publish_onehot_vec = publish_vec.copy()
    if publish_str == "No limit":
        publish_onehot_vec = [1 for _ in publish_vec]
    else:
        # Handle multiple ratings separated by commas
        publish_arr = publish_str.split(",")
        for j in publish_arr:
            j = int(j)
            publish_onehot_vec[j - 1] = 1
    # One-hot encode the click rating
    click_str = str(article_df["Click rating"][i])
    click_onehot_vec = click_vec.copy()
    if click_str == "No limit":
        click_onehot_vec = [1 for _ in click_vec]
    else:
        # Handle multiple ratings separated by commas
        click_arr = click_str.split(",")
        for j in click_arr:
            j = int(j)
            click_onehot_vec[j - 1] = 1
    # Concatenate the one-hot vectors into a single feature vector for the article
    wenz_list.append(
        cols_onehot_vec + area_onehot_vec + publish_onehot_vec + click_onehot_vec
    )
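# wenz_list now holds one feature vector per article; every vector has length
# len(cols_vec) + len(area_vec) + len(publish_vec) + len(click_vec).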
# Print article information list
# for v in wenz_list:
# print(v)
# Process user information: the user data only contains the column numbers of interest
# and the region number, so the remaining two features are imputed:
# 1. Release time: assume the user prefers the most recent articles, i.e. rating 1
# 2. Clicks: assume the user prefers the most clicked articles, i.e. the highest rating
user_list = []
for i in range(len(user_df)):
    # One-hot encode the column numbers the user is interested in
    cols_str = str(user_df["Column number of interest"][i])
    cols_onehot_vec = cols_vec.copy()
    if cols_str == "No limit":
        cols_onehot_vec = [1 for _ in cols_vec]
    else:
        # Handle multiple column numbers separated by commas
        cols_arr = cols_str.split(",")
        for j in cols_arr:
            j = int(j)
            cols_onehot_vec[j - 1] = 1
    # One-hot encode the user's region number
    area_str = str(user_df["Geographic number to which it belongs"][i])
    area_onehot_vec = area_vec.copy()
    if area_str == "No limit":
        area_onehot_vec = [1 for _ in area_vec]
    else:
        # Handle multiple region numbers separated by commas
        area_arr = area_str.split(",")
        for j in area_arr:
            j = int(j)
            area_onehot_vec[j - 1] = 1
    # The user's implicit release time interest is set to the most recent, i.e. rating 1,
    # giving a one-hot vector of the form [1, 0, 0, ...]
    publish_onehot_vec = publish_vec.copy()
    publish_onehot_vec[0] = 1
    # The user's implicit click interest is set to the highest, i.e. the last rating,
    # giving a one-hot vector of the form [0, ..., 0, 1]
    click_onehot_vec = click_vec.copy()
    click_onehot_vec[-1] = 1
    # Concatenate the one-hot vectors into a single feature vector for the user
    user_list.append(
        cols_onehot_vec + area_onehot_vec + publish_onehot_vec + click_onehot_vec
    )
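# user_list vectors share the same layout and length as the article vectors in
# wenz_list, so the two sets can be compared directly with cosine similarity.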
# for v in user_list:
# print(v)
# Step 3: Compute the cosine similarity between each user and each article
sim_lis = cosine_similarity(user_list, wenz_list)
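# sim_lis has shape (number of users, number of articles); sim_lis[u][a] is the
# cosine similarity between user u's feature vector and article a's feature vector.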
# Step 4: Sort articles by similarity for each user
sorted_sim_lis = []
for lis in sim_lis:
    # Pair each similarity score with its article number (1-based)
    lis1 = [(i + 1, sim_v) for i, sim_v in enumerate(lis)]
    # Sort in descending order of similarity
    lis1 = sorted(lis1, key=lambda row: row[1], reverse=True)
    sorted_sim_lis.append(lis1)
# Print the ranked (article number, similarity) pairs for each user
for i, v in enumerate(sorted_sim_lis):
    print(user_df["Nick name"][i])
    print(v)
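# Optional sketch (not part of the original flow): print only each user's top 3
# recommended article numbers; TOP_N is an illustrative parameter.
TOP_N = 3
for i, v in enumerate(sorted_sim_lis):
    top_articles = [article_no for article_no, _ in v[:TOP_N]]
    print(user_df["Nick name"][i], "->", top_articles)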