sample.py

#!/usr/bin/env python
# -*- encoding: euc-jp -*-

# Standard-library modules used by this script: sys, re and urllib.
import sys, re, urllib

# Regular expression used to cut text into tokens.  Each token is a maximal
# run of characters drawn from exactly one of the classes listed below;
# anything else (spaces, punctuation, other symbols) acts as a separator.
# NOTE(review): the prolonged sound mark (U+30FC) lies outside the katakana
# range used here, so it splits katakana words -- confirm this is intended.
JP_TOKEN = re.compile(u"|".join((
  u"[一-龠]+",      # kanji, U+4E00..U+9FA0
  u"[ぁ-ん]+",      # hiragana, U+3041..U+3093
  u"[ァ-ヴ]+",      # katakana, U+30A1..U+30F4
  u"[a-zA-Z0-9]+",  # ASCII letters and digits
)))


##  Document - definition of the document object
##
class Document:
  """Token statistics for a single document.

  Attributes:
    docid: identifier supplied by the caller; only stored and shown in repr.
    nwords: total number of tokens added so far.
    tf: dict mapping each token to the number of times it has been added.
  """

  def __init__(self, docid):
    """Create an empty document labelled with docid."""
    self.docid = docid
    self.nwords = 0
    self.tf = {}

  def __repr__(self):
    """Return a short representation for debugging output."""
    return '<Document: docid=%s, nwords=%d>' % (self.docid, self.nwords)

  def add_string(self, s):
    """Tokenize the string s with JP_TOKEN and add its tokens to the counts."""
    tokens = JP_TOKEN.findall(s)
    counts = self.tf
    for token in tokens:
      # A token seen for the first time starts from zero.
      counts[token] = counts.get(token, 0) + 1
    # Every token found counts as one word of the document.
    self.nwords += len(tokens)

  def get_word_probability(self, word):
    """Return how often word occurs relative to all tokens added so far.

    A word that has never been added yields 0.0.
    """
    if word not in self.tf:
      return 0.0
    # float() keeps the division from truncating to an integer.
    return self.tf[word] / float(self.nwords)


# readdoc(fp, docid):
# Read the given file object and build a Document object from its contents.
def readdoc(fp, docid, encoding='utf-8'):
  """Read a file-like object line by line and build a Document from it.

  Args:
    fp: object whose readline() returns an empty string at end of input.
      Lines may be byte strings or already-decoded text.
    docid: identifier handed to Document().
    encoding: codec used to decode lines that arrive as byte strings.
      Text lines are used unchanged.  Plain ASCII input gives the same
      result under the default as it did before this parameter existed.

  Returns:
    The Document holding the token counts of every line read.

  Raises:
    UnicodeDecodeError: if a byte-string line is not valid in encoding.
  """
  # type(u'') is unicode on Python 2 and str on Python 3.
  text_type = type(u'')
  doc = Document(docid)
  while True:
    line = fp.readline()
    # An empty result means the end of the input has been reached.
    if not line:
      break
    if not isinstance(line, text_type):
      # Decode explicitly: converting with unicode(line) uses the implicit
      # ASCII codec and fails on the first non-ASCII (e.g. Japanese) byte.
      line = line.decode(encoding)
    doc.add_string(line.strip())
  return doc


# test
if __name__ == "__main__":
  # Command-line arguments: the URL to fetch and the word to look up.
  if len(sys.argv) < 3:
    # Exit with a usage message instead of an IndexError traceback.
    sys.exit("usage: %s url word" % sys.argv[0])
  (url, word) = (sys.argv[1], sys.argv[2])
  if not isinstance(word, type(u"")):
    # Python 2 passes arguments as byte strings in the locale's encoding.
    # unicode(word) would decode them as ASCII and fail on a Japanese word.
    word = word.decode(sys.getfilesystemencoding() or "utf-8")
  # Single-argument print(...) produces the same output as the Python 2
  # print statement while keeping the syntax valid on newer interpreters.
  print("Reading... %s" % url)
  fp = urllib.urlopen(url)
  try:
    doc = readdoc(fp, url)
  finally:
    # Release the connection even if reading or decoding fails.
    fp.close()
  print("Document: %s" % (doc,))
  print("Probability of '%s' = %.6f" % (word, doc.get_word_probability(word)))