# coding: utf-8
from scipy.sparse import csr_matrix

# Build a document-term count matrix incrementally in CSR form:
# each document becomes one row, each distinct term one column.
docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]

indptr = [0]  # Row offsets: row i occupies data[indptr[i]:indptr[i + 1]]
indices = []  # Column index of each entry in data (may repeat within a row)
data = []  # One entry (the value 1) per term occurrence
vocabulary = {}  # Maps each term to its column index

for d in docs:  # One matrix row per document
    for term in d:  # One stored entry per term occurrence
        # setdefault returns the column index already assigned to the term;
        # for an unseen term it first registers the next free index,
        # len(vocabulary), and returns that.
        index = vocabulary.setdefault(term, len(vocabulary))
        indices.append(index)
        data.append(1)
    # Close the row: the next document's entries start at this offset.
    indptr.append(len(indices))

# Duplicate (row, column) entries are summed when the matrix is densified,
# so a term repeated within a document yields its occurrence count.
term_counts = csr_matrix((data, indices, indptr), dtype=int).toarray()