How to separate the text from a Twitter streaming JSON response and analyze the text with Python



I am trying to do sentiment analysis on text from the Twitter API. The problems I am running into are that I don't understand how to separate the text out of each tweet, and I don't understand the sentiment polarity analysis provided by the TextBlob library. On top of that, I would like to pull back only English tweets. The output is in JSON format.

Below is the code that streams tweets based on keywords (in this case "usd", "euro", "loonie"), along with my lame attempt at storing the text and using the result in a variable:

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import re
import pandas as pd
import matplotlib.pyplot as plt

#Variables that contain the user credentials to access the Twitter API
access_token = "xxxx"
access_token_secret = "xxxx"
consumer_key = "xxxx"
consumer_secret = "xxxx"

#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    def on_data(self, data):
        print data
        return True

    def on_error(self, status):
        print status

if __name__ == '__main__':
    #This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
    #This line filters Twitter Streams to capture data by the keywords: 'euro', 'dollar', 'loonie'
    stream.filter(track=['euro', 'dollar', 'loonie'])
    tweets_data_path = stream.filter
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    print len(tweets_data)
    tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
    wiki = TextBlob(tweets['text'])
    r = wiki.sentiment.polarity
    print r

This is what the output looks like:

{"created_at":"Sun Jun 14 23:43:31+0000 2015","id":610231121016524801,"id_str":"6102311210165 24801","text":"RT@amirulimann:RM6 diperlukan utk tukar kpd 1磅。\nRM3 diperlucan utk tukal kpd 1S'美元。\n\nRaf matawang jatuh.Tak sedih ke?htt\u2026","source":"\u003ca href=\"http://twitter.com/download/iphone\"rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to-status_id_str":null、"in_reply_to_user_id":null,"user":{"id":42642877,"id_str":"42642877","name":"Wny","screen_name":"waaannyyy","location":"Dirgahayu Darul Makmur","url":null,"description":"Aku serba tiada,Aku kekurangan。","protected":false,"verified":false,"followers_count":320,"friends_count":239,"listed_count":1,"preferentes_count":4344,"statuses_count":34408,"created_at":"2009年5月26日星期二15:10:28+000","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false"profile_background_color":"FFFFFFFFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_tile":true,"profil_link_color":"DD2E44","profile _ sidebar_order_colorhttp://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42642877/1415486321","default_profile":false,"default.profile_image":false、"follow_requestrongent":null、"notifications":null}、"geo":null,"coordinates":null"place":null;"contributors":null。"retweeted_status":{"created_at":"Sat Jun 13 03:33:29+0000 2015"、"id":609564219495706624、"id_str":"609564219496706624"、"text":"RM6需要1英镑。\nRM3需要额外的1美元。\n\nGraf matawang jatuh。德?http://t.co/dum4skb6uK","source":"\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to-status_id_str":null、"in_repli_to_user_id":null;"in_re普利_to_user_id_str":null;"in_reply_to_screen_name":null imann","location":"+06车型年款","url":"http://instagram.com/amirulimannn","description":"我想淹死在她的香水瓶里","protected":false,"verified":false","followers_count":723","friends_count":834","listed_count":2","preferentes_count":4810","statuses_count":50981","created_at":"2012年2月3日星期五07:49:55+00000","utc_offset":28800","time_zone":"Kuala Lumpur","geo_enabled":true","lang":"en","contributors_enabled":false"is _translator":false,"profile_background_color":"AD0A20","profile_bbackground_image_url":"http://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_tile":false,"profil_link_color":"E36009","profile _ 
sidebar_order_colorhttp://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/481856658/1428379855","default_profile":false,"default.profile_image":false、"follow_requestrongent":null、"notifications":null}、"geo":null,"coordinates":null"place":null;"contributors":null。"retweet_count":1321,"favorite_count":229,"entities":{"hashtags":[]、"trends":[],"urls":[];"user_mentations":[]。"symbol":[]"media":[{"id":609564142886760448,"id_str":"609564142886760448","索引":[118140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dump4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340","h":340,"resize":"fit"},"thumb":{"w":150","h:150","resize":"crop id_str":"609564142886760448","索引":[118140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":">

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
# Variables that contain the user credentials to access the Twitter API
access_token = ''
access_token_secret = ''
consumer_key = ''
consumer_secret = ''

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    def on_data(self, data):
        json_load = json.loads(data)
        texts = json_load['text']
        coded = texts.encode('utf-8')
        s = str(coded)
        print(s[2:-1])
        return True

    def on_error(self, status):
        print(status)

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener())
# This line filters Twitter Streams to capture data by the keywords: 'euro', 'dollar', 'loonie'
stream.filter(track=['euro', 'dollar', 'loonie', ], languages=['en'])

For your original question about the JSON: you can load the data stream with json.loads(). The rest of the steps are there so that you don't get a charmap error when you pull the data from Twitter into Python. The s[2:-1] is there to strip the extra characters left over from encoding to UTF-8.
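To make that concrete, here is a minimal sketch of what happens inside on_data, with a hard-coded raw_tweet string standing in for the data argument (the string itself is invented for illustration):

import json

# Stand-in for the raw `data` string the listener receives (invented example).
raw_tweet = '{"text": "Caf\\u00e9 au lait for one euro \\u2026", "lang": "en"}'

tweet = json.loads(raw_tweet)   # parse the JSON string into a dict
text = tweet['text']            # the tweet body lives under the 'text' key

coded = text.encode('utf-8')    # bytes: b'Caf\xc3\xa9 au lait for one euro \xe2\x80\xa6'
s = str(coded)                  # str() keeps the bytes repr, e.g. "b'Caf\\xc3\\xa9 ...'"
print(s[2:-1])                  # slice off the leading b' and the trailing '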

For English-only tweets, you can also filter directly from the stream with languages=['en'].

I'm not familiar with the TextBlob library, but you can store the text in any number of ways; simply write the information to a file and read it straight back from the file when you run TextBlob. You can replace print(s[2:-1]) with the following, or add it alongside:

myFile = open('text.csv', 'a')
myFile.write(s[2:-1])
myFile.write('\n')  # adds a line break between tweets
myFile.close()

You can then use file = open('text.csv', 'r') for the sentiment analysis. Don't forget to add file.close() when you open a file this way.
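If it helps, here is a hedged sketch of the TextBlob step itself, assuming text.csv holds one tweet per line as written above (a with block closes the file automatically, so no explicit file.close() is needed):

from textblob import TextBlob

# Score each saved tweet; assumes one tweet per line in text.csv.
with open('text.csv', 'r') as tweets_file:
    for line in tweets_file:
        text = line.strip()
        if not text:
            continue  # skip blank lines between tweets
        polarity = TextBlob(text).sentiment.polarity
        # polarity is a float in [-1.0, 1.0], from negative to positive
        print('%+.2f  %s' % (polarity, text))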
