テキスト抽出処理 - A More Beautiful day

regexp:// や xpath:// といった、おかしな設定方法はやめました。
これで、ある程度はテキストから情報を抽出できます。

結果はハッシュの配列なので、そのまま RSS にでも
Atom にでも入れてしまえばいい。

require 'rubygems'
require 'hpricot'

# Extracts a list of records from markup or plain text.
#
# Records are located either with an XPath expression (evaluated through
# Hpricot) or with a regular expression; the fields of every record are then
# pulled out of that record's text in the same two ways.
class Extract
  # Splits +text+ into records and extracts the configured fields of each one.
  #
  # @param text [String] the document to scan
  # @param opt [Hash] configuration:
  #   :record     => { :xpath => expr } or { :regexp => source } (required).
  #                  With :regexp, each record is the full text of one match,
  #                  whether or not the pattern contains capture groups.
  #   :item       => { field => { :xpath => expr } or { :regexp => source } }
  #                  (required). A regexp field yields its first capture group,
  #                  or nil when the pattern does not match. An XPath field
  #                  yields the first matching node as a string, or '' when
  #                  nothing matches. If a field has both, the XPath result wins.
  #   :after_hook => optional. Either an object responding to #call, invoked
  #                  as hook.call(item, r), or a String of Ruby code that is
  #                  eval'ed inside the record loop (it can read and modify the
  #                  locals +item+ and +r+).
  # @return [Array<Hash>] one hash per record, keyed by field name; [] when
  #   :record or :item is missing or :record names neither :xpath nor :regexp
  #
  # All regexp sources are compiled with the /m flag.
  def self.match(text, opt)
    return [] unless opt.key?(:record) && opt.key?(:item)

    # NOTE: local variable names in this method are visible to String hooks
    # through eval's binding, so they are kept stable on purpose.
    if opt[:record].key?(:xpath)
      doc = Hpricot(text)
      record = doc.search(opt[:record][:xpath]).map { |node| node.to_s }
    elsif opt[:record].key?(:regexp)
      # String#scan returns arrays of captures when the pattern has groups;
      # collecting the whole match keeps every record a String.
      record = []
      text.scan(/#{opt[:record][:regexp]}/m) { record << Regexp.last_match[0] }
    else
      return []
    end

    item_xpath = opt[:item].select { |_key, val| val.key?(:xpath) }
    item_regexp = opt[:item].select { |_key, val| val.key?(:regexp) }

    # Compile each field pattern once instead of once per record.
    item_patterns = item_regexp.map { |key, val| [key, /#{val[:regexp]}/m] }

    data = []

    record.each do |r|
      item = {}

      item_patterns.each do |key, pattern|
        item[key] = r.match(pattern).to_a[1]
      end

      # 0 is truthy in Ruby, so test emptiness explicitly; this avoids
      # parsing the record when no XPath field is configured.
      unless item_xpath.empty?
        doc = Hpricot(r)

        item_xpath.each do |key, val|
          # nil.to_s is '', which covers the "no such node" case.
          item[key] = doc.at(val[:xpath]).to_s
        end
      end

      if opt.key?(:after_hook)
        hook = opt[:after_hook]
        if hook.respond_to?(:call)
          hook.call(item, r)
        else
          # SECURITY: this evaluates arbitrary Ruby code. Never pass a hook
          # string that comes from untrusted input; prefer a callable hook.
          eval(hook)
        end
      end

      data << item
    end

    data
  end
end

テストコード

  require 'pp'

  # Sample document: three <item> records under /root/record.
  text = <<EOD
<root>
  <record>
    <item>
      <name>user01</name>
      <id>0</id>
      <date>2007/11/25</date>
      <comment>comment01 foobar</comment>
    </item>
    <item>
      <name>user02</name>
      <id>1</id>
      <date>2007/11/28</date>
      <comment>comment02 foobar</comment>
    </item>
    <item>
      <name>user03</name>
      <id>2</id>
      <date>2007/11/30</date>
      <comment>comment03 foobar</comment>
    </item>
  </record>
</root>
EOD

  # Records are selected with XPath; the fields mix both extraction modes
  # (:comment uses a regexp, the others use XPath relative to the record).
  # The former call that passed the old 'record_fetch' => 'xpath://...' option
  # was dropped: it returned [] and its result was overwritten right away.
  data = Extract.match(text, {
    :record => {
      :xpath => '/root/record/item',
    },
    :item => {
      :name => {
        :xpath => 'name/text()',
      },
      :date => {
        :xpath => 'date/text()',
      },
      :comment => {
        :regexp => '<comment>(.+?)</comment>',
      },
      :id => {
        :xpath => 'id/text()',
      },
    }
  })

  pp data

実行結果

[{:name=>"user01",
:id=>"0",
:date=>"2007/11/25",
:comment=>"comment01 foobar"},
{:name=>"user02",
:id=>"1",
:date=>"2007/11/28",
:comment=>"comment02 foobar"},
{:name=>"user03",
:id=>"2",
:date=>"2007/11/30",
:comment=>"comment03 foobar"}]