require 'rubygems'
require 'hpricot'
# Small DSL for pulling structured data out of a chunk of HTML/text using
# regular expressions and/or Hpricot XPath lookups.
#
#   data = Extract.match(:text => html) do
#     summary :title => {:xpath => '//title'}
#     record(:xpath => '//li') do
#       item :name => {:regexp => 'name=(\w+)'}
#     end
#   end
#   data # => {:summary => {:title => ...}, :item => [{:name => ...}, ...]}
class Extract
  # Evaluates +block+ in the context of a fresh Extract::Item built from
  # +opt+ (which supplies the source under +:text+) and returns the data
  # hash the block populated: {:summary => {...}, :item => [...]}.
  def self.match(opt, &block)
    extractor = Item.new(opt)
    extractor.instance_eval(&block)
    extractor.data
  end

  # Plumbing shared by the field-extraction DSL methods (Item#summary and
  # Property#item). Both classes keep the text being searched in @text.
  module FieldSpecs
    private

    # Yields every (name, spec) pair of +opt+ whose spec hash contains
    # the +selector+ key (e.g. :regexp or :xpath).
    def each_spec(opt, selector)
      opt.each_pair { |name, spec| yield name, spec if spec.key?(selector) }
    end

    # Returns the first capture group of +pattern+ (compiled in multiline
    # mode) matched against @text, or nil when there is no match.
    def first_capture(pattern)
      @text.match(%r{#{pattern}}m).to_a[1]
    end
  end

  # Top-level extraction context: collects document-wide summary fields
  # and a list of per-record property hashes.
  class Item
    include FieldSpecs

    attr_accessor :data

    # +opt+ must provide the source to search under +:text+.
    def initialize(opt)
      @text = opt[:text]
      @data = {:summary => {}, :item => []}
    end

    # Fills data[:summary] from +opt+, a hash of field name => spec where
    # a spec is {:regexp => pattern} and/or {:xpath => path, :expr => mode}.
    # Regexp fields are resolved first, so a field carrying both selectors
    # ends up with its XPath value. Returns nil.
    #
    # TODO(review): specs keyed by :const are accepted but never applied;
    # confirm whether they were meant to store a literal value.
    def summary(opt)
      fields = @data[:summary]
      each_spec(opt, :regexp) do |name, spec|
        fields[name] = first_capture(spec[:regexp])
      end

      doc = nil
      each_spec(opt, :xpath) do |name, spec|
        # Parse lazily: regexp-only calls never need an Hpricot document.
        doc ||= Hpricot(@text)
        fields[name] = summary_value(doc.at(spec[:xpath]), spec[:expr])
      end
      nil
    end

    # Splits the source into fragments -- nodes matching opt[:xpath] or
    # substrings matching opt[:regexp] -- and evaluates +block+ against an
    # Extract::Property for each one, appending its data to data[:item].
    # Returns self. Raises ArgumentError when neither selector is given.
    def record(opt, &block)
      fragments =
        if opt.key?(:xpath)
          Hpricot(@text).search(opt[:xpath]).map { |node| node.to_s }
        elsif opt.key?(:regexp)
          # With capture groups, scan yields arrays; join them (the Ruby 1.8
          # Array#to_s behaviour) rather than emitting their #inspect text.
          @text.scan(%r{#{opt[:regexp]}}m).map do |hit|
            hit.is_a?(Array) ? hit.join : hit
          end
        else
          raise ArgumentError, 'record requires an :xpath or :regexp option'
        end

      fragments.each do |fragment|
        property = Property.new(fragment)
        property.instance_eval(&block)
        @data[:item] << property.data
      end
      self
    end

    private

    # Converts the node found for a summary field into a string; anything
    # that is neither a text node nor an element (e.g. nil) becomes ''.
    def summary_value(node, expr)
      case node
      when Hpricot::Text
        node.to_s
      when Hpricot::Elem
        case expr
        when :text then node.to_plain_text
        else node.inner_text # :inner_text and the default
        end
      else
        ''
      end
    end
  end

  # Extraction context for a single record fragment.
  class Property
    include FieldSpecs

    attr_accessor :text, :data

    # +text+ is the fragment (markup or plain text) of one record.
    def initialize(text)
      @text = text
      @data = {}
    end

    # Fills data from +opt+, a hash of field name => spec where a spec is
    # {:regexp => pattern} and/or {:xpath => path, :expr => mode}. Regexp
    # fields are resolved first, so a field carrying both selectors ends up
    # with its XPath value. Returns nil.
    #
    # TODO(review): specs keyed by :const are accepted but never applied;
    # confirm whether they were meant to store a literal value.
    def item(opt)
      each_spec(opt, :regexp) do |name, spec|
        @data[name] = first_capture(spec[:regexp])
      end

      doc = nil
      each_spec(opt, :xpath) do |name, spec|
        # Parse lazily: regexp-only calls never need an Hpricot document.
        doc ||= Hpricot(@text)
        @data[name] = item_value(doc.at(spec[:xpath]), spec[:expr])
      end
      nil
    end

    private

    # Converts the node found for an item field into a string; anything
    # that is neither a text node nor an element (e.g. nil) becomes ''.
    def item_value(node, expr)
      case node
      when Hpricot::Text
        node.to_s
      when Hpricot::Elem
        case expr
        when /inner_text/ # must be tested before /text/, which it contains
          node.inner_text
        when /text/
          # Only the element's own text nodes, not text of nested elements.
          node.search('/').select { |child| child.class == Hpricot::Text }.join('')
        when /@(.+)/
          # "@name" selects the attribute called name.
          node[Regexp.last_match(1)]
        else
          node.inner_text
        end
      else
        ''
      end
    end
  end
end