require 'rubygems'
require 'mechanize'
# Create a Mechanize agent configured with the 'Windows Mozilla' user-agent alias.
browser = Mechanize.new do |client|
  client.user_agent_alias = 'Windows Mozilla'
end
# Fetch the URL given as the first command-line argument and print the
# response body exactly as received (i.e. the raw HTML).
puts browser.get(ARGV[0]).content
上面的代码返回的是 HTML,但我想要的是纯文本,最好还能保留一定的格式。
下面的代码可以提取无格式的纯文本(被注释掉的那一行会取得整个文档的文本,未注释的那一行只提取指定元素的文本):
require 'mechanize'
require 'nokogiri'
# Create a Mechanize agent configured with the 'Windows Mozilla' user-agent alias.
rational = Mechanize.new do |agent|
  agent.user_agent_alias = 'Windows Mozilla'
end
# Fetch the URL given as the first command-line argument and parse the
# response body with Nokogiri so text can be pulled out of the markup.
document = Nokogiri::HTML(rational.get(ARGV[0]).content)
# Taking the text of the whole document gives a very dirty result:
#results = document.inner_text
# Prefer extracting the text of one specific element instead.
# NOTE(review): the selector below looks like a placeholder; replace it with
# one that matches the page being scraped.
results = document.css("#content .my-element-with-some-contents").inner_text
# Print the extracted text; without this the script produces no output.
puts results