R - 展平嵌套很深且格式混乱的 JSON 文件



我用 tweepy 提取了一些 Twitter 数据,得到的 JSON 格式不太规整,而我一直想不出一种处理单个文件不需要花上几个小时的办法。(我大约有 100 个文件,总共约 50 万行 Twitter 数据。)

下面附上我的json文件的一(1)行示例。

如果有帮助的话,这里有一个云端硬盘链接,指向我其中一个较小的文件。

说真的,任何能做到这一点的方法我都不胜感激。

谢谢!

{"created_at":"2019年2月7日星期四10:58:27+0000","id":109346400120016896,"id_str":"109346400100016896","text":"RT@sethia_b:@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs@Satishrathd100\u2026","source":"\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to-status_id_str":null、"in_reply_to_user_id":null,"user":{"id":1070710084441784320,"id_str":"10707100844.1784320","name":"Kumaran","screen_name":"Kumaran92023000","location":null,"url":null"description":null,"translator_type":"none","protected":false,"verified":false"followers_count":222,"friends_count":427,"listed_count":0,"preferentes_count":6036,"status_count"6834,"created_at":"2018年12月6日星期四16:02:31+0000","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false"profile_background_color _color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/1070712098160705542/02c6-KwM_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/1070712098160705542/02c6-KwM_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/1070710084441784320/1544112886","default_profile":true,"default.profile_image":false,"follow_requestrongent":null,"notifications":null},"geo":null"coordinates":null、"place":null"、"contributors":null)、"retweeted_status":{"created_at":"Thu Feb 07 09:02:19+00000 2019"、"id":1093434772070658048、"id_str":"109343477207 0658048"、"text":"@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs\u2026https://t.co/lfjAiatkbP","display_text_range":[117140],"source":"\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":true,"in_reply_to_status_id":1093433827501006848,"in-reply_to-status_id_str":"109343382750 
1006848","in_reply_to_user_id":2783847553,"in_reply_to_user_id_str","278384755","user":{"id":796185239370379264,"id_str":"7961852339370379264","name":"B.Sthia","screen_name":"friends_count":2315,"listed_count":8,"preferentes_count":71196,"statuses_count":56458,"created_at":"2016年11月9日星期三02:58:48+000","utc_offset":null,"time_zone":null、"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false","profile_background_color":"F5F8FA","profile_bbackground_image_url":","profile _background_image_url_https":","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_side bar_fill_colorhttp://pbs.twimg.com/profile_images/1053957260974542848/RrRuZL1g_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/1053957260974542848/RrRuZL1g_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/796185239370379264/1531817659","default_profile":true,"default.profile_image":false,"follow_requestrongent":null,"notifications":null},"geo":null"coordinates":null、"place":null"、"contributors":null、"is_quote_status":false、"extended_tweet":{"full_text":"@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs@Satishrathd100@banerji1@ramakirao@RaghavendraUp16@ads7506@Kumaran92023000@kjayashree31@gouranga1964@SureshanDelhi@ravi_sec@faramroze@kavita_wari@kailashkaushik8@pushprajdumraon@FatychandJ@sukmaranlens@Hydbiryani11@ChaudhrGurnam@DurgaMaddikon da@ShibaBhanja@aarjeekaykannan@singhsantosh98@badi_mishra@LighteningBolt9@birajanath@sn_ojha@Speakwithsence1@dharmvirjangra9@LillyMaryPinto@RohiniShah73@postcard_news@MailOnline@EconomicTimes@narendramodi@BJP4India@AmitShah@TarekFatah@TrueIndology@republic@TelegraphTech 10名纳萨尔人在恰蒂斯加尔邦与安全部队的遭遇中丧生","display_text_range":[666731],"实体":{"标签":[],"url":[],"user_mentations":[{"screen_name":"nparama1951","name":"N.Paramasivam","id":2783847553,"id_str":"278384755">

这看起来像 ndjson 格式。ndjson 和 jsonlite 这两个包都可以处理它。

dat <- ndjson::stream_in("data/INCIndia26febru.json")
# Source: local data table [1,584 x 2,814]
# 
# # A tibble: 1,584 x 2,814
#    contributors coordinates created_at entities.hashta… entities.symbols entities.urls entities.user_m…
#           <int>       <int> <chr>                 <int>            <int>         <int>            <dbl>
#  1           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  2           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  3           NA          NA Tue Feb 2…               NA               NA            NA          2.23e 8
#  4           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
#  5           NA          NA Tue Feb 2…               NA               NA            NA          1.06e18
#  6           NA          NA Tue Feb 2…               NA               NA            NA          1.47e 8
#  7           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
#  8           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  9           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
# 10           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
# # … with 1,574 more rows, and 2,807 more variables: entities.user_mentions.0.id_str <chr>,
# #   entities.user_mentions.0.indices.0 <dbl>, entities.user_mentions.0.indices.1 <dbl>,
# #   entities.user_mentions.0.name <chr>, entities.user_mentions.1.id <dbl>,
# #   entities.user_mentions.0.screen_name <chr>, entities.user_mentions.1.id_str <chr>,
# #   entities.user_mentions.1.indices.0 <dbl>, retweeted_status.entities.user_mentions.0.name <chr>,
# #   retweeted_status.entities.user_mentions.0.screen_name <chr>,
# #   retweeted_status.extended_tweet.display_text_range.0 <dbl>,
# #   retweeted_status.entities.user_mentions.0.id_str <chr>,
# #   retweeted_status.entities.user_mentions.0.indices.0 <dbl>,
# #   retweeted_status.entities.user_mentions.0.indices.1 <dbl>, retweeted_status.entities.urls.0.url <chr>,
# #   retweeted_status.entities.user_mentions.0.id <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.media_url_https <chr>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.h <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.medium.h <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.resize <chr>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.w <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.indices.0 <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.indices.1 <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.media_url <chr>, in_reply_to_user_id_str <chr>,
# #   in_reply_to_status_id_str <chr>, is_quote_status <lgl>, lang <chr>, id_str <chr>,
# #   in_reply_to_screen_name <chr>, in_reply_to_status_id <dbl>, in_reply_to_user_id <dbl>, retweet_count <dbl>,
# #   place <int>, quote_count <dbl>, retweeted <lgl>, retweeted_status.contributors <int>, reply_count <dbl>,
# #   retweeted_status.coordinates <int>, retweeted_status.created_at <chr>, user.created_at <chr>,
# #   user.contributors_enabled <lgl>, retweeted_status.user.utc_offset <int>,
# #   retweeted_status.user.verified <lgl>, source <chr>, text <chr>, timestamp_ms <chr>, truncated <lgl>,
# #   user.default_profile <lgl>, user.following <int>, user.default_profile_image <lgl>, user.description <chr>,
# #   user.follow_request_sent <int>, user.followers_count <dbl>, user.friends_count <dbl>,
# #   user.favourites_count <dbl>, quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.resize <chr>,
# #   quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.w <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.0 <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.1 <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.type <chr>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.duration_millis <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.variants.0.bitrate <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.url <chr>,
# #   retweeted_status.entities.hashtags.0.indices.0 <dbl>, retweeted_status.entities.hashtags.0.indices.1 <dbl>,
# #   retweeted_status.entities.hashtags.0.text <chr>, favorite_count <dbl>,
# #   entities.user_mentions.1.indices.1 <dbl>, entities.user_mentions.1.name <chr>,
# #   entities.user_mentions.1.screen_name <chr>, favorited <lgl>, filter_level <chr>, geo <int>, id <dbl>,
# #   retweeted_status.favorite_count <dbl>, retweeted_status.favorited <lgl>, retweeted_status.filter_level <chr>,
# #   retweeted_status.extended_tweet.full_text <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.resize <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.w <dbl>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.type <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.url <chr>, user.id_str <chr>,
# #   user.is_translator <lgl>, user.lang <chr>, user.listed_count <dbl>, user.location <chr>, user.name <chr>,
# #   user.geo_enabled <lgl>, user.id <dbl>, retweeted_status.display_text_range.0 <dbl>,
# #   retweeted_status.display_text_range.1 <dbl>, retweeted_status.entities.hashtags <int>,
# #   retweeted_status.entities.symbols <int>, retweeted_status.entities.urls.0.display_url <chr>,
# #   retweeted_status.entities.urls.0.expanded_url <chr>, retweeted_status.entities.urls.0.indices.0 <dbl>,
# #   retweeted_status.entities.urls.0.indices.1 <dbl>, retweeted_status.geo <int>, …

或者:

dat <- jsonlite::stream_in(file("data/INCIndia26febru.json"))
tibble::glimpse(dat)
# Observations: 1,584
# Variables: 36
# $ created_at                <chr> "Tue Feb 26 13:09:36 +0000 2019", "Tue Feb 26 13:09:38 +0000 2019", "Tue Feb …
# $ id                        <dbl> 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.10038…
# $ id_str                    <chr> "1100382373823299586", "1100382382396448770", "1100382385013645314", "1100382…
# $ text                      <chr> "RT @INCIndia: Congress President @RahulGandhi addresses North-East DCC &amp;…
# $ source                    <chr> "<a href="http://twitter.com/download/android" rel="nofollow">Twitter for…
# $ truncated                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ in_reply_to_status_id     <dbl> NA, 1.100360e+18, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_status_id_str <chr> NA, "1100359823630237697", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ in_reply_to_user_id       <dbl> NA, 1.153045e+09, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_user_id_str   <chr> NA, "1153045459", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_screen_name   <chr> NA, "INCIndia", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ user                      <data.frame> <data.frame[38 x 39]>
# $ geo                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ coordinates               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ place                     <data.frame> <data.frame[38 x 9]>
# $ contributors              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ retweeted_status          <data.frame> <data.frame[38 x 34]>
# $ is_quote_status           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FA…
# $ quote_count               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ reply_count               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ retweet_count             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ favorite_count            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ entities                  <data.frame> <data.frame[38 x 5]>
# $ favorited                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ retweeted                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ filter_level              <chr> "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", …
# $ lang                      <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en",…
# $ timestamp_ms              <chr> "1551186576692", "1551186578736", "1551186579360", "1551186579893", "15511865…
# $ display_text_range        <list> [NULL, <23, 100>, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL…
# $ possibly_sensitive        <lgl> NA, NA, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ quoted_status_id          <dbl> NA, NA, NA, NA, NA, NA, 1.100365e+18, NA, NA, 1.100365e+18, NA, NA, 1.100365e…
# $ quoted_status_id_str      <chr> NA, NA, NA, NA, NA, NA, "1100364859768782849", NA, NA, "1100364859768782849",…
# $ quoted_status             <data.frame> <data.frame[38 x 32]>
# $ quoted_status_permalink   <data.frame> <data.frame[38 x 3]>
# $ extended_tweet            <data.frame> <data.frame[38 x 4]>
# $ extended_entities         <data.frame> <data.frame[38 x 1]>

最新更新