データ抽出クラス

ソースコード

require 'rubygems'
require 'hpricot'

# Small DSL for pulling named fields out of a text/HTML document using
# regular expressions, Hpricot path lookups, or constant values.
#
# Usage sketch (selectors are illustrative):
#
#   Extract.match(:text => html) do
#     summary :title => {:xpath => '//title'}
#     record(:xpath => '//li') do
#       item :link => {:xpath => '//a', :expr => '@href'}
#     end
#   end
#   # => {:summary => {...}, :item => [{...}, ...]}
class Extract
  # Builds an Item for +opt+, evaluates the block in its context (so the
  # block can call +summary+ and +record+ directly) and returns the
  # collected data hash.
  #
  # @param opt [Hash] options; +:text+ holds the document to search
  # @return [Hash] +{:summary => {...}, :item => [...]}+
  def self.match(opt, &block)
    record = Extract::Item.new(opt)
    # instance_eval raises when handed no block; an absent block simply
    # means "nothing to extract".
    record.instance_eval(&block) if block
    record.data
  end

  # Field-extraction logic shared by Item#summary and Property#item.
  # Including classes must provide +@text+ and a private
  # +element_value(elem, expr)+ method.
  module FieldExtraction
    private

    # Fills +target+ from +specs+, a hash of field name => spec hash.
    # A spec may carry +:regexp+ (first capture group, multiline match),
    # +:xpath+ (first matching node, optionally refined by +:expr+) and/or
    # +:const+ (literal value). They are applied in that order, so a later
    # kind overwrites an earlier one for the same field.
    #
    # @return [nil]
    def extract_fields(specs, target)
      specs.each do |key, spec|
        next unless spec.key?(:regexp)
        # nil.to_a is [], so a failed match stores nil.
        target[key] = @text.match(%r{#{spec[:regexp]}}m).to_a[1]
      end

      xpath_specs = specs.select { |_, spec| spec.key?(:xpath) }
      # Only parse the document when at least one field needs it
      # (0 is truthy in Ruby, so testing `size` alone never skips this).
      unless xpath_specs.empty?
        doc = Hpricot(@text)
        xpath_specs.each do |key, spec|
          target[key] = node_value(doc.at(spec[:xpath]), spec[:expr])
        end
      end

      specs.each do |key, spec|
        target[key] = spec[:const] if spec.key?(:const)
      end

      nil
    end

    # Converts the result of +doc.at+ into a field value: text nodes are
    # stringified, elements are delegated to +element_value+, and anything
    # else (including no match) becomes an empty string.
    def node_value(node, expr)
      case node
      when Hpricot::Text then node.to_s
      when Hpricot::Elem then element_value(node, expr)
      else ''
      end
    end
  end

  # Document-level extractor: collects summary fields and repeated records.
  class Item
    include FieldExtraction

    attr_accessor :data

    # @param opt [Hash] +:text+ is the document to search
    def initialize(opt)
      @text = opt[:text]
      @data = {:summary => {}, :item => []}
    end

    # Extracts document-wide fields into +data[:summary]+.
    #
    # @param opt [Hash] field name => spec hash (+:regexp+, +:xpath+ with
    #   optional +:expr+ of +:text+ or +:inner_text+, or +:const+)
    # @return [nil]
    def summary(opt)
      extract_fields(opt, @data[:summary])
    end

    # Splits the document into fragments and evaluates the block once per
    # fragment in the context of a Property, appending each Property's data
    # to +data[:item]+. +:xpath+ takes precedence over +:regexp+.
    #
    # @param opt [Hash] must contain +:xpath+ or +:regexp+
    # @return [Extract::Item] self
    # @raise [ArgumentError] when neither selector is given
    def record(opt, &block)
      fragments =
        if opt.key?(:xpath)
          Hpricot(@text).search(opt[:xpath]).map { |node| node.to_s }
        elsif opt.key?(:regexp)
          # With capture groups, scan yields arrays; join them instead of
          # relying on Array#to_s, which returns inspect output on 1.9+.
          @text.scan(%r{#{opt[:regexp]}}m).map do |hit|
            hit.is_a?(Array) ? hit.join : hit.to_s
          end
        else
          raise ArgumentError, 'record requires an :xpath or :regexp selector'
        end

      fragments.each do |fragment|
        prop = Extract::Property.new(fragment)
        prop.instance_eval(&block) if block
        @data[:item] << prop.data
      end

      self
    end

    private

    # +:text+ selects Hpricot's plain-text rendering; every other +expr+
    # (including nil) selects +inner_text+.
    def element_value(elem, expr)
      expr == :text ? elem.to_plain_text : elem.inner_text
    end
  end

  # Per-record extractor: collects fields from a single fragment.
  class Property
    include FieldExtraction

    attr_accessor :text, :data

    # @param text [String] the fragment to search
    def initialize(text)
      @text = text
      @data = {}
    end

    # Extracts fields from this fragment into +data+.
    #
    # @param opt [Hash] field name => spec hash (+:regexp+, +:xpath+ with
    #   optional +:expr+, or +:const+). +:expr+ may be +'@name'+ for an
    #   attribute, +'inner_text'+, or +'text'+ for the element's direct
    #   text children only.
    # @return [nil]
    def item(opt)
      extract_fields(opt, @data)
    end

    private

    # Resolves +expr+ against an element. The attribute form is tested
    # first so attribute names containing "text" (e.g. +@data-text+) are
    # not captured by the looser text patterns.
    def element_value(elem, expr)
      case expr.to_s
      when /@(.+)/
        elem[Regexp.last_match(1)]
      when /inner_text/
        elem.inner_text
      when /text/
        elem.search('/').select { |child| child.instance_of?(Hpricot::Text) }.join('')
      else
        elem.inner_text
      end
    end
  end
end