import sys, re, urllib
# Tokenizer pattern: each match is a maximal run of characters drawn from ONE
# of four disjoint classes -- kanji (U+4E00..U+9FA0), hiragana
# (U+3041..U+3093), katakana (U+30A1..U+30F4), or ASCII letters/digits.
# Anything else (whitespace, punctuation, and also the prolonged sound mark
# U+30FC) matches no alternative and therefore only separates tokens.
JP_TOKEN = re.compile(u"[一-龠]+|[ぁ-ん]+|[ァ-ヴ]+|[a-zA-Z0-9]+")
class Document(object):
    """Term-frequency table for a single document.

    Attributes:
        docid: Identifier supplied by the caller; only stored and shown in
            ``repr``.
        nwords: Total number of tokens added so far (duplicates included).
        tf: Dict mapping each distinct token to its occurrence count.
    """

    def __init__(self, docid):
        """Create an empty document labelled *docid*."""
        self.docid = docid
        self.nwords = 0
        self.tf = {}

    def __repr__(self):
        return '<Document: docid=%s, nwords=%d>' % (self.docid, self.nwords)

    def add_string(self, s):
        """Tokenize *s* with ``JP_TOKEN`` and count every token found.

        Each token increments both its own entry in ``tf`` and the running
        total ``nwords``, so ``sum(tf.values()) == nwords`` is maintained.
        """
        for word in JP_TOKEN.findall(s):
            self.tf[word] = self.tf.get(word, 0) + 1
            self.nwords += 1

    def get_word_probability(self, word):
        """Return the relative frequency of *word* in this document.

        Returns:
            ``tf[word] / nwords`` as a float, or ``0.0`` when *word* has not
            been seen (which includes the empty-document case).
        """
        count = self.tf.get(word, 0)
        if not count:
            # Unseen word: answer directly so an empty document never
            # reaches the division below.
            return 0.0
        return count / float(self.nwords)
def readdoc(fp, docid, encoding="utf-8"):
    """Read *fp* line by line and build a ``Document`` from its contents.

    Args:
        fp: Object with a ``readline()`` method that returns an empty
            (falsy) value at end of input.
        docid: Identifier passed through to ``Document``.
        encoding: Codec used to decode lines that arrive as byte strings.
            Lines that are already text are used unchanged.

    Returns:
        The populated ``Document``.

    Raises:
        UnicodeDecodeError: If a byte line is not valid in *encoding*.
    """
    doc = Document(docid)
    while True:
        line = fp.readline()
        if not line:
            break
        # Decode explicitly: a bare unicode(line) falls back to the ASCII
        # codec and fails on the first non-ASCII (e.g. Japanese) byte.
        if isinstance(line, bytes):
            line = line.decode(encoding)
        doc.add_string(line.strip())
    return doc
if __name__ == "__main__":
    # Script entry point: fetch URL, tokenize it, and report how frequent
    # WORD is among the document's tokens.
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s URL WORD\n" % sys.argv[0])
        sys.exit(2)
    (url, word) = (sys.argv[1], sys.argv[2])
    # A byte-string argument must be decoded with a real codec; the implicit
    # ASCII conversion rejects any non-ASCII (e.g. Japanese) search word.
    if isinstance(word, bytes):
        word = word.decode(sys.getfilesystemencoding() or "utf-8")
    print("Reading... %s" % url)
    fp = urllib.urlopen(url)
    try:
        doc = readdoc(fp, url)
    finally:
        # Release the connection even when reading or decoding fails.
        fp.close()
    print("Document: %s" % doc)
    print("Probability of '%s' = %.6f" % (word, doc.get_word_probability(word)))