2012年3月6日火曜日

find2ch

2chの記事検索結果を返すAPIを作ろうとGoogleの検索APIを調べてみたが、どうも使えそうにないので、find.2ch.netのHTMLをパースしてスレッド一覧を抽出するスクリプトを書いた。
# 例外処理は省略

#!/usr/bin/ruby
# -*- coding: utf-8 -*-

# Sets the Ruby 1.8 global $KCODE to 'u' (UTF-8 handling for strings/regexps).
# NOTE(review): Ruby 1.9 and later no longer honour $KCODE — confirm which
# interpreter version this script is expected to run on.
$KCODE = 'u'
 
require 'rubygems'
require 'open-uri'
require 'nkf'
require 'uri'

require 'pp'

# Base URL of the find.2ch.net search page; search2chThread appends the query
# string to it. Frozen so the shared constant cannot be modified in place.
SEARCH_URL = 'http://find.2ch.net'.freeze

# Plain value holder for one thread entry produced by search2chThread.
#
# All seven attributes are stored exactly as passed in (no validation or
# conversion) and are readable and writable:
#
#   url            - thread URL
#   dat            - URL of the thread's .dat file
#   thread_number  - numeric thread id taken from the thread URL
#   title          - thread title
#   current_cursor - number shown in parentheses next to the title
#                    (presumably the current post count - TODO confirm)
#   board_url      - URL of the board the thread belongs to
#   board_name     - display name of that board
class ThreadModel
  attr_accessor :url, :dat, :thread_number, :title,
                :current_cursor, :board_url, :board_name

  def initialize(url, dat, thread_number, title, current_cursor, board_url, board_name)
    # Assigned left to right, so instance variables are created in the same
    # order as the parameters (this is the order #inspect prints them in).
    @url, @dat, @thread_number = url, dat, thread_number
    @title, @current_cursor = title, current_cursor
    @board_url, @board_name = board_url, board_name
  end
end

# Searches find.2ch.net for threads whose title matches +keyword+.
#
# Every result row recognised in the response is converted into a ThreadModel,
# printed with +p+ and collected into the returned Array.
#
# keyword - search string; it is converted to EUC-JP with NKF and
#           percent-encoded before being placed in the STR query parameter.
# options - optional Hash (it is only read, never modified):
#           :count  - value sent as the COUNT parameter  (default 100)
#           :offset - value sent as the OFFSET parameter (default 0)
#
# Returns false when keyword is nil/false, otherwise an Array of ThreadModel
# (empty when no line of the response looks like a result row).
# Errors raised by open-uri (network failures, HTTP errors) are not rescued.
def search2chThread(keyword, options={})
  return false unless keyword

  count  = options[:count]  || 100
  offset = options[:offset] || 0

  # URI#open comes from open-uri; the block form closes the handle after the
  # body has been read. (Kernel#open no longer accepts URLs on current Ruby.)
  body = URI.parse(find2ch_search_url(keyword, count, offset)).open { |io| io.read }

  threads = []
  # String#each is gone on current Ruby; each_line is the line iterator.
  body.each_line do |raw|
    # Transcode to UTF-8 first so strip and the regexps operate on UTF-8 text.
    thread = parse_find2ch_line(NKF.nkf('-wxm0Z0', raw).strip)
    next unless thread

    p thread
    threads << thread
  end
  threads
end

# Builds the search request URL for +keyword+ with the given COUNT and OFFSET.
def find2ch_search_url(keyword, count, offset)
  # The keyword is sent as EUC-JP (NKF -e). Form-component encoding escapes
  # every reserved byte, so characters such as "&" or "=" in the keyword cannot
  # break the query string (a space is sent as "+").
  encoded = URI.encode_www_form_component(NKF.nkf('-e', keyword))
  "#{SEARCH_URL}?STR=#{encoded}&COUNT=#{count}&OFFSET=#{offset}" \
    '&SCEND=A&SORT=MODIFIED&TYPE=TITLE&BBS=ALL'
end

# Converts one UTF-8 line of the result page into a ThreadModel.
# Returns nil when the line is not a result row.
#
# NOTE(review): the row pattern assumes markup of the form
#   <dt><a href="THREAD_URL">TITLE</a> (COUNT) ... <a href="BOARD_URL">BOARD</a>
# It supplies the five values the ThreadModel needs (thread URL, title, count,
# board URL, board name) - confirm against a real find.2ch.net response.
def parse_find2ch_line(line)
  row = %r{^<dt><a href="(.+?)">\s*(.+?)</a>\s*\((\d+)\).*?<a href="(.+?)">(.+?)</a>}.match(line)
  return nil unless row

  href, title, current_cursor, board_url, board_name = row.captures

  # Thread links look like http://HOST/test/read.cgi/BOARD/NUMBER/...; one match
  # yields the canonical thread URL, the .dat URL and the thread number.
  parts = %r{^(http://.+?)/test/read\.cgi/(.+?)/(\d+)/}.match(href)
  if parts
    host, board, number = parts.captures
    url = "#{host}/test/read.cgi/#{board}/#{number}/"
    dat = "#{host}/#{board}/dat/#{number}.dat"
  else
    # Not a read.cgi link: keep the link as-is and leave the derived fields nil
    # rather than filling them with the unrelated URL.
    url = href
    dat = nil
    number = nil
  end

  ThreadModel.new(url, dat, number, title, current_cursor, board_url, board_name)
end

# Demo run, executed only when this file is run directly rather than loaded:
# searches for a sample keyword with :count 10 and :offset 2.
search2chThread('原発', :count => 10, :offset => 2) if __FILE__ == $0

__END__

0 件のコメント:

コメントを投稿