# Read before copying:
# --------------------
# * This is NOT a standalone Python file, it only contains partial Python code.
# * Copy the code of the function you need and paste it in the respective
#   function body in 'hw06_word_similarity/cooccurrence.py'
# * Imports are already present in 'hw06_word_similarity/cooccurrence.py'
#   (the code below uses `math`, `defaultdict` and `lil_matrix`).


def cooccurrences(tokens, n, vocab):
    """Count ordered co-occurrences of vocabulary words within a window.

    For every position whose token is in ``vocab``, each in-vocabulary token
    located at most ``n`` positions to its left or right is counted once
    under the key ``(middle_word, context_word)``.  Tokens that are not in
    ``vocab`` are never counted, but they still occupy a position in the
    window.

    Args:
        tokens: Sliceable sequence (e.g. a list) of hashable tokens.
        n: Window radius, i.e. the number of positions inspected on each
            side of the middle word.  Values ``<= 0`` yield no counts.
        vocab: Container supporting ``in``; a set keeps the membership
            tests cheap.

    Returns:
        A ``defaultdict(int)`` mapping ``(middle_word, context_word)`` to the
        number of times the pair was observed.
    """
    cooc_dict = defaultdict(int)
    if n <= 0:
        # A non-positive radius means an empty window.  Returning early also
        # avoids wrap-around slices that negative offsets would produce.
        return cooc_dict

    for i, middle_word in enumerate(tokens):
        if middle_word not in vocab:
            continue
        # Clamp the left edge at 0 so the slice never wraps to the end.
        left_context = tokens[max(0, i - n):i]
        right_context = tokens[i + 1:i + n + 1]
        for context in (left_context, right_context):
            for context_word in context:
                if context_word in vocab:
                    cooc_dict[(middle_word, context_word)] += 1
    return cooc_dict


def cooc_dict_to_matrix(cooc_dict, vocab):
    """Convert a co-occurrence dictionary into a sparse square matrix.

    Words are assigned ids ``0 .. k-1`` in sorted order, where ``k`` is the
    number of distinct words in ``vocab``.  Duplicates in ``vocab`` are
    ignored so that the ids stay contiguous and the matrix has exactly one
    row and one column per word.

    Args:
        cooc_dict: Mapping ``(word, context_word) -> count``.
        vocab: Iterable of sortable, hashable words.

    Returns:
        A tuple ``(matrix, word_to_id)``: ``matrix`` is a ``k x k``
        ``lil_matrix`` with ``matrix[word_to_id[w1], word_to_id[w2]]`` set to
        the count of ``(w1, w2)``, and ``word_to_id`` maps each word to its
        row/column index.

    Raises:
        KeyError: If ``cooc_dict`` contains a word that is not in ``vocab``.
    """
    word_to_id = {word: idx for idx, word in enumerate(sorted(set(vocab)))}
    size = len(word_to_id)
    matrix = lil_matrix((size, size))
    for (w1, w2), count in cooc_dict.items():
        matrix[word_to_id[w1], word_to_id[w2]] = count
    return matrix, word_to_id


def ppmi_weight(cooc_matrix):
    """Re-weight a co-occurrence matrix with positive pointwise mutual information.

    For every non-zero cell the value
    ``log(count) + log(total) - log(row_sum) - log(col_sum)`` (natural
    logarithm) is computed and stored only if it is strictly positive; all
    other cells stay zero.

    Args:
        cooc_matrix: Sparse matrix of co-occurrence counts whose ``sum``
            along an axis returns an object with an ``A1`` attribute
            (as the ``lil_matrix`` built by ``cooc_dict_to_matrix`` does).

    Returns:
        A ``lil_matrix`` with the same shape as ``cooc_matrix`` holding the
        PPMI weights.
    """
    ppmi_matrix = lil_matrix(cooc_matrix.shape)
    rows, cols = cooc_matrix.nonzero()
    if len(rows) == 0:
        # Nothing to weight; also avoids taking the log of a zero total.
        return ppmi_matrix

    sum_in_col = cooc_matrix.sum(0).A1
    sum_in_row = cooc_matrix.sum(1).A1
    # The total is the same for every cell, so take its log only once.
    log_total = math.log(cooc_matrix.sum())

    for row, col in zip(rows, cols):
        ppmi = (
            math.log(cooc_matrix[row, col])
            + log_total
            - math.log(sum_in_row[row])
            - math.log(sum_in_col[col])
        )
        if ppmi > 0:
            ppmi_matrix[row, col] = ppmi
    return ppmi_matrix