Here is the script function that causes the problem:
def crawl(feedbacks, source, project_id, url, use_spam_filter)
  @logger.info url
  xml = open(url)
  doc = Nokogiri::HTML(xml, nil, 'UTF-8')
  doc.xpath("//entry").each do |entry|
    title = entry.at("./title").content
    content = entry.at("./content").content
    content.force_encoding('UTF-8')
    content = content.gsub(/[^0-9a-z ]/i, '')  # keep only ASCII alphanumerics and spaces
    language = @language_detector.detect(content)
    if language != 'en'
      puts "#{language}: #{title}"
      next
    end
    if use_spam_filter && @spam_filter.is_spam?(content)
      puts "spam: #{title}"
      next
    end
    #content = strip_invalid_utf8_chars(content)
    puts "encoding: #{content.encoding.name}"
    polarity, description = @sentiment_classifier.process(content)
    published = Time.zone.parse(entry.at("./published").content)
    link = entry.at("./link[@rel='alternate']")["href"]
    author_image = entry.at("./link[@rel='image']")["href"] rescue nil
    author_name = entry.at("./author/name").content
    author_url = entry.at("./author/uri").content
    if source == Feedback::BLOG && @url_filter.should_ignore(link)
      puts "urlfilter: #{title}"
      next
    elsif source == Feedback::TWITTER && @author_filter.should_ignore(author_name)
      puts "authorfilter: #{title}"
      next
    end
    feedbacks << [project_id, published, title, description, link, polarity,
                  author_image, author_name, author_url, source, project_id.to_s + link]
  end
rescue Exception => e
  puts e
  puts e.backtrace.join("\n")
  @logger.info e.message
  @logger.info e.backtrace.join("\n")
end
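
As far as I understand, force_encoding only changes the label on the string; it never validates or converts the underlying bytes, so a string can report UTF-8 while still containing invalid sequences. A self-contained illustration (with a deliberately broken byte, not actual data from the feeds):

s = "caf\xC3"              # "\xC3" starts a multi-byte sequence that is never completed
s.force_encoding('UTF-8')  # the same call the crawler makes; it only relabels
puts s.encoding.name       # => UTF-8  (exactly what my log prints)
puts s.valid_encoding?     # => false  (the bytes are still invalid)
s.gsub(/[^0-9a-z ]/i, '')  # => ArgumentError: invalid byte sequence in UTF-8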
Whenever the crawler parses the following URLs, I get an "invalid byte sequence in UTF-8" error:

http://blogsearch.google.com/blogsearch_feeds?hl=en&q=%22Goodyear%22&ie=utf-8&num=100&output=atom&as_oq=goodyear+tires

and

http://search.twitter.com/search.atom?q=Goodyear&rpp=100&phrase=goodyear+tires
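
A trimmed-down sketch that reproduces the failure against either feed (the same open-uri/Nokogiri path as above, with everything else removed):

require 'open-uri'
require 'nokogiri'

url = "http://search.twitter.com/search.atom?q=Goodyear&rpp=100&phrase=goodyear+tires"
doc = Nokogiri::HTML(open(url), nil, 'UTF-8')
doc.xpath("//entry").each do |entry|
  content = entry.at("./content").content
  content.force_encoding('UTF-8')
  content.gsub(/[^0-9a-z ]/i, '')  # the ArgumentError is raised here
end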
content.encoding.name always reports UTF-8, but I don't understand why the error still occurs. A compatibility problem already forced me to reinstall the latest version of Nokogiri.
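
For completeness, the commented-out strip_invalid_utf8_chars call above was one attempt at scrubbing the bytes. A minimal sketch of such a helper (my sketch, not necessarily the exact original) uses the classic encode round-trip through UTF-16; on Ruby 2.1+ the built-in String#scrub does the same job:

# Sketch only: drop bytes that are not valid UTF-8. The detour through
# UTF-16 is needed on Ruby 1.9/2.0, where a direct UTF-8 -> UTF-8 encode
# is a no-op and :invalid => :replace never gets a chance to fire.
def strip_invalid_utf8_chars(str)
  str.encode('UTF-16', invalid: :replace, undef: :replace, replace: '')
     .encode('UTF-8')
end

# Ruby 2.1+ alternative:
# content = content.scrub('')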