import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
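# Content-based article recommendation:
# encode articles and users as concatenated one-hot feature vectors,
# then rank the articles for each user by cosine similarity.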
# Step 1: Feature selection
# Load the data files
user_df = pd.read_csv("data/user information.csv")
article_df = pd.read_csv("data/Article Information.csv")
# print(article_df)
cols_df = pd.read_csv("data/column information.csv")
# print(cols_df)
area_df = pd.read_csv("data/regional information.csv")
# print(area_df)
publish_df = pd.read_csv("data/release time information.csv")
# print(publish_df)
click_df = pd.read_csv("data/click information.csv")
# print(click_df)
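# The five CSV files provide the user profiles, the article metadata, and the
# value ranges of the column, region, release time rating, and click rating features.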
# Step 2: Feature preprocessing (one-hot encoding)
cols_vec = [0 for _ in cols_df["Column Number"]]
# The list comprehension above is equivalent to:
# cols_vec = []
# for _ in cols_df["Column Number"]:
#     cols_vec.append(0)
area_vec = [0 for _ in area_df["Geographical Number"]]
publish_vec = [0 for _ in publish_df["Publishing time number"]]
click_vec = [0 for _ in click_df["Click rating number"]]
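# Each *_vec above is a zero-filled template whose length equals the number of
# possible values for that feature, e.g. with 5 columns cols_vec == [0, 0, 0, 0, 0].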
# Traverse each article and encode its features
wenz_list = []
for i in range(len(article_df)):
    # One-hot encode the column numbers
    cols_str = str(article_df["Column Number"][i])
    cols_onehot_vec = cols_vec.copy()
    if cols_str == "No limit":
        # "No limit" means the article belongs to every column
        cols_onehot_vec = [1 for _ in cols_vec]
    else:
        # Handle multiple column numbers separated by commas
        cols_arr = cols_str.split(",")
        for j in cols_arr:
            j = int(j)
            cols_onehot_vec[j - 1] = 1
    # One-hot encode the region numbers
    area_str = str(article_df["Geographical Number"][i])
    area_onehot_vec = area_vec.copy()
    if area_str == "No limit":
        area_onehot_vec = [1 for _ in area_vec]
    else:
        # Handle multiple region numbers separated by commas
        area_arr = area_str.split(",")
        for j in area_arr:
            j = int(j)
            area_onehot_vec[j - 1] = 1
    # One-hot encode the release time rating
    publish_str = str(article_df["Release time rating"][i])
    publish_onehot_vec = publish_vec.copy()
    if publish_str == "No limit":
        publish_onehot_vec = [1 for _ in publish_vec]
    else:
        # Handle multiple ratings separated by commas
        publish_arr = publish_str.split(",")
        for j in publish_arr:
            j = int(j)
            publish_onehot_vec[j - 1] = 1
    # One-hot encode the click rating
    click_str = str(article_df["Click rating"][i])
    click_onehot_vec = click_vec.copy()
    if click_str == "No limit":
        click_onehot_vec = [1 for _ in click_vec]
    else:
        # Handle multiple ratings separated by commas
        click_arr = click_str.split(",")
        for j in click_arr:
            j = int(j)
            click_onehot_vec[j - 1] = 1
    # Concatenate the one-hot vectors into a single feature vector for the article
    wenz_list.append(
        cols_onehot_vec + area_onehot_vec + publish_onehot_vec + click_onehot_vec
    )
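# wenz_list now holds one feature vector per article; every vector has length
# len(cols_vec) + len(area_vec) + len(publish_vec) + len(click_vec).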
# Print article information list
# for v in wenz_list:
# print(v)
# Process user information: the user data only contains the column numbers of interest
# and the region number, so the remaining two features are imputed:
# 1. Release time: assume the user prefers the most recent articles, i.e. rating 1
# 2. Clicks: assume the user prefers the most clicked articles, i.e. the highest rating
user_list = []
for i in range(len(user_df)):
    # One-hot encode the column numbers the user is interested in
    cols_str = str(user_df["Column number of interest"][i])
    cols_onehot_vec = cols_vec.copy()
    if cols_str == "No limit":
        cols_onehot_vec = [1 for _ in cols_vec]
    else:
        # Handle multiple column numbers separated by commas
        cols_arr = cols_str.split(",")
        for j in cols_arr:
            j = int(j)
            cols_onehot_vec[j - 1] = 1
    # One-hot encode the user's region number
    area_str = str(user_df["Geographic number to which it belongs"][i])
    area_onehot_vec = area_vec.copy()
    if area_str == "No limit":
        area_onehot_vec = [1 for _ in area_vec]
    else:
        # Handle multiple region numbers separated by commas
        area_arr = area_str.split(",")
        for j in area_arr:
            j = int(j)
            area_onehot_vec[j - 1] = 1
    # The user's implicit release time interest is set to the most recent, i.e. rating 1,
    # giving a one-hot vector of the form [1, 0, 0, ...]
    publish_onehot_vec = publish_vec.copy()
    publish_onehot_vec[0] = 1
    # The user's implicit click interest is set to the highest, i.e. the last rating,
    # giving a one-hot vector of the form [0, ..., 0, 1]
    click_onehot_vec = click_vec.copy()
    click_onehot_vec[-1] = 1
    # Concatenate the one-hot vectors into a single feature vector for the user
    user_list.append(
        cols_onehot_vec + area_onehot_vec + publish_onehot_vec + click_onehot_vec
    )
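# user_list vectors share the same layout and length as the article vectors in
# wenz_list, so the two sets can be compared directly with cosine similarity.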
# for v in user_list:
# print(v)
# Step 3: Compute the cosine similarity between each user and each article
sim_lis = cosine_similarity(user_list, wenz_list)
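# sim_lis has shape (number of users, number of articles); sim_lis[u][a] is the
# cosine similarity between user u's feature vector and article a's feature vector.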
# Step 4: Sort articles by similarity for each user
sorted_sim_lis = []
for lis in sim_lis:
    # Pair each similarity score with its article number (1-based)
    lis1 = [(i + 1, sim_v) for i, sim_v in enumerate(lis)]
    # Sort in descending order of similarity
    lis1 = sorted(lis1, key=lambda row: row[1], reverse=True)
    sorted_sim_lis.append(lis1)
# Print the ranked (article number, similarity) pairs for each user
for i, v in enumerate(sorted_sim_lis):
    print(user_df["Nick name"][i])
    print(v)
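# Optional sketch (not part of the original flow): print only each user's top 3
# recommended article numbers; TOP_N is an illustrative parameter.
TOP_N = 3
for i, v in enumerate(sorted_sim_lis):
    top_articles = [article_no for article_no, _ in v[:TOP_N]]
    print(user_df["Nick name"][i], "->", top_articles)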