-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsearch.py
More file actions
164 lines (137 loc) · 5.68 KB
/
Copy pathsearch.py
File metadata and controls
164 lines (137 loc) · 5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# -*- coding: utf-8 -*-
import math
import random
from collections import defaultdict, OrderedDict
from data import Vector
class TFIDF(object):
    """TF-IDF index over a fixed keyword vocabulary.

    Every document is turned into a keyword-count vector and then into a
    TF-IDF vector. The index supports ranking documents against a free-text
    question (``search``) and grouping documents with a k-means style loop
    (``group_kmeans``).

    Vector comparison is delegated to ``Vector.similarity`` from the ``data``
    module (presumably cosine similarity -- confirm against ``data.Vector``).

    The code avoids Python-2-only dict iterators and ``xrange`` so that it
    runs unchanged on Python 2 and Python 3.
    """

    def __init__(self, keywords, documents, cleaner):
        """Build the index.

        Args:
            keywords: iterable of raw vocabulary words; they are run through
                ``cleaner.clean_wordlist`` and de-duplicated.
            documents: sequence of records whose items ``[0]`` and ``[1]``
                are strings (presumably title and body -- confirm against
                the caller). ``search`` reports item ``[0]`` in its results.
            cleaner: object exposing ``clean_wordlist(words)`` which returns
                the normalised list of words.
        """
        self.keywords = None
        self.keywords_lookup = None
        self.documents_count = None
        self.documents = None
        self.document_vectors = None
        self.documents_tfidfs = {}
        self.cleaner = cleaner
        # Order matters: vectors need the keywords, document frequencies
        # need the vectors, and TF-IDF needs the document frequencies.
        self._setup_keywords(keywords)
        self._setup_documents(documents)
        self._setup_documents_count()
        self._setup_documents_tfidfs()

    def _setup_keywords(self, keywords):
        """Create the word -> index and index -> word maps.

        Cleaned keywords are de-duplicated and sorted so that every keyword
        gets a stable position in all vectors.
        """
        self.keywords = OrderedDict()
        self.keywords_lookup = OrderedDict()
        unique_words = set(self.cleaner.clean_wordlist(keywords))
        for position, word in enumerate(sorted(unique_words)):
            self.keywords[word] = position
            self.keywords_lookup[position] = word

    def _setup_documents(self, documents):
        """Store the documents and compute a keyword-count vector for each."""
        self.documents = documents
        self.document_vectors = {}
        for index, document in enumerate(documents):
            words = document[0].split() + document[1].split()
            cleaned = self.cleaner.clean_wordlist(words)
            self.document_vectors[index] = self.wordlist_to_vector(cleaned)

    def _setup_documents_tfidfs(self):
        """Compute and cache the TF-IDF vector of every document."""
        for index in range(len(self.documents)):
            self.documents_tfidfs[index] = self.tfidf(
                self.document_vectors[index])

    def _setup_documents_count(self):
        """Count, per keyword, how many documents contain it at least once."""
        self.documents_count = defaultdict(int)
        for keyword, position in self.keywords.items():
            for vector in self.document_vectors.values():
                if vector[position] > 0:
                    self.documents_count[keyword] += 1

    def wordlist_to_vector(self, wordlist):
        """Return the keyword-count vector of an already cleaned word list.

        Words that are not keywords are ignored. The result has one entry
        per keyword, in keyword order.
        """
        counts = defaultdict(int)
        for word in wordlist:
            if word in self.keywords:
                counts[word] += 1
        return [counts[word] for word in self.keywords]

    def search(self, question):
        """Rank the documents against a free-text question.

        Returns:
            A list of ``(document[0], similarity, document_index)`` tuples,
            highest similarity first. Documents whose similarity is not
            greater than zero are left out.
        """
        question_tfidfs = self.tfidf(self.phrase_to_vector(question))
        ranking = [
            (index, self.doc_phrase_similarity(index, question_tfidfs))
            for index in range(len(self.documents))
        ]
        # sorted() is stable, so equally scored documents keep index order.
        ranking.sort(key=lambda entry: entry[1], reverse=True)
        return [
            (self.documents[index][0], score, index)
            for index, score in ranking
            if score > 0
        ]

    def phrase_to_vector(self, phrase):
        """Split and clean a phrase, then return its keyword-count vector."""
        cleaned = self.cleaner.clean_wordlist(phrase.split())
        return self.wordlist_to_vector(cleaned)

    def doc_phrase_similarity(self, doc_index, question_tfidfs):
        """Return the similarity of a document and a given TF-IDF vector."""
        return Vector.similarity(
            self.tfidf_by_doc_index(doc_index), question_tfidfs)

    def doc_doc_similarity(self, doc1_index, doc2_index):
        """Return the similarity of two indexed documents."""
        return Vector.similarity(
            self.tfidf_by_doc_index(doc1_index),
            self.tfidf_by_doc_index(doc2_index))

    def tfidf_by_doc_index(self, i):
        """Return the cached TF-IDF vector of document ``i``."""
        return self.documents_tfidfs[i]

    def tfidf(self, document):
        """Return the TF-IDF vector (list of floats) of a count vector."""
        return [
            float(self.tf(document, word) * self.idf(word))
            for word in self.keywords
        ]

    def tf(self, document, term):
        """Return the term frequency of ``term`` in a count vector.

        The raw count is normalised by the largest count in the vector;
        a keyword that does not occur yields 0.
        """
        occurrences = document[self.keywords[term]]
        if occurrences == 0:
            return 0
        return float(occurrences) / float(max(document))

    def idf(self, term):
        """Return ``log10(N / n)`` for ``term``.

        ``N`` is the number of documents and ``n`` the number of documents
        containing the term; a term found in no document yields 0.
        """
        containing = self.documents_count[term]
        if containing == 0:
            return 0
        return math.log(float(len(self.documents)) / float(containing), 10)

    def get_term_document_matrix(self):
        """Return the document TF-IDF vectors as a list, in index order."""
        return [
            self.documents_tfidfs[index]
            for index in range(len(self.documents_tfidfs))
        ]

    def group_kmeans(self, k, max_iters):
        """Group the documents into ``k`` clusters.

        Starts from ``k`` randomly chosen document vectors as centroids and
        alternates assignment and centroid recomputation until the groups
        stop changing or ``max_iters`` iterations have been performed.

        Args:
            k: number of groups.
            max_iters: upper bound on the number of iterations.

        Returns:
            A list of groups, each a list of document indices. If no
            iteration runs (``max_iters <= 0``) a single group holding
            every document is returned.

        Raises:
            ValueError: raised by ``random.sample`` when ``k`` exceeds the
                number of documents.
        """
        # list(): a dict view is not a sequence, which random.sample needs.
        centroids_tfidfs = random.sample(
            list(self.documents_tfidfs.values()), k)
        groups = [list(range(len(self.documents)))]
        previous_groups = []
        iteration = 0
        while iteration < max_iters and previous_groups != groups:
            previous_groups = groups
            groups = self.assign_to_groups(centroids_tfidfs)
            centroids_tfidfs = self.centroids(groups)
            # Without this increment max_iters would never take effect.
            iteration += 1
        return groups

    def assign_to_groups(self, centroids_tfidfs):
        """Assign every document to its most similar centroid.

        Ties go to the earliest centroid. A document for which no
        similarity exceeds -1 falls back to the first group.

        Returns:
            One list of document indices per centroid, in centroid order.
        """
        groups = [[] for _ in centroids_tfidfs]
        for doc_id in range(len(self.documents)):
            best_score = -1
            best_group = 0
            for group_id, centroid_tfidf in enumerate(centroids_tfidfs):
                score = self.doc_phrase_similarity(doc_id, centroid_tfidf)
                if score > best_score:
                    best_score = score
                    best_group = group_id
            groups[best_group].append(doc_id)
        return groups

    def centroids(self, groups):
        """Return the centroid of every group, in group order."""
        return [self.centroid(group) for group in groups]

    def centroid(self, group):
        """Return the component-wise mean TF-IDF vector of a group.

        An empty group yields an empty list.
        """
        size = float(len(group))
        vectors = [self.documents_tfidfs[doc_id] for doc_id in group]
        # zip(*vectors) walks the vectors column by column.
        return [sum(column) / size for column in zip(*vectors)]