R - 展平嵌套很深且格式不规范的JSON文件

我用tweepy提取了一些twitter数据，得到的json格式并不理想，而我一直想不出一种处理单个文件不需要花上几个小时的方法。(我在大约100个文件中有大约50万行twitter数据)。

下面附上我的json文件的一(1)行示例。

如果有帮助的话，这里有一个指向我其中一个较小文件的网盘(Drive)链接。

说真的，任何能做到这一点的方法我都将不胜感激。

谢谢！

{"created_at"："2019年2月7日星期四10:58:27+0000"，"id"：109346400120016896，"id_str"："109346400100016896"，"text"："RT@sethia_b:@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs@Satishrathd100\u2026"，"source"："\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e"，"truncated"：false，"in_reply_to_status_id"：null，"in_reply_to-status_id_str"：null、"in_reply_to_user_id"：null，"user"：｛"id"：1070710084441784320，"id_str"："10707100844.1784320"，"name"："Kumaran"，"screen_name"："Kumaran92023000"，"location"：null，"url"：null"description"：null，"translator_type"："none"，"protected"：false，"verified"：false"followers_count"：222，"friends_count"：427，"listed_count"：0，"preferentes_count":6036，"status_count"6834，"created_at"："2018年12月6日星期四16:02:31+0000"，"utc_offset"：null，"time_zone"：null，"geo_enabled"：false，"lang"："en"，"contributors_enabled"：false，"is_translator"：false"profile_background_color _color"："333333"，"profile_use_background_image"：true，"profile_image_url"："http://pbs.twimg.com/profile_images/1070712098160705542/02c6-KwM_normal.jpg"，"profile_image_url_https"："https://pbs.twimg.com/profile_images/1070712098160705542/02c6-KwM_normal.jpg"，"profile_banner_url"："https://pbs.twimg.com/profile_banners/1070710084441784320/1544112886"，"default_profile"：true，"default.profile_image"：false，"follow_requestrongent"：null，"notifications"：null｝，"geo"：null"coordinates"：null、"place"：null"、"contributors"：null)、"retweeted_status"：｛"created_at"："Thu Feb 07 09:02:19+00000 2019"、"id"：1093434772070658048、"id_str"："109343477207 0658048"、"text"："@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs\u2026https://t.co/lfjAiatkbP"，"display_text_range"：[117140]，"source"："\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e"，"truncated"：true，"in_reply_to_status_id"：1093433827501006848，"in-reply_to-status_id_str"："109343382750 
1006848"，"in_reply_to_user_id"：2783847553，"in_reply_to_user_id_str"，"278384755"，"user"：｛"id"：796185239370379264，"id_str"："7961852339370379264"，"name"："B.Sthia"，"screen_name"："friends_count"：2315，"listed_count"：8，"preferentes_count"：71196，"statuses_count"：56458，"created_at"："2016年11月9日星期三02:58:48+000"，"utc_offset"：null，"time_zone"：null、"geo_enabled"：true，"lang"："en"，"contributors_enabled"：false，"is_translator"：false"，"profile_background_color"："F5F8FA"，"profile_bbackground_image_url"："，"profile _background_image_url_https"："，"profile_background_tile"：false，"profile_link_color"："1DA1F2"，"profile_sidebar_border_color"："C0DEED"，"profile_side bar_fill_colorhttp://pbs.twimg.com/profile_images/1053957260974542848/RrRuZL1g_normal.jpg"，"profile_image_url_https"："https://pbs.twimg.com/profile_images/1053957260974542848/RrRuZL1g_normal.jpg"，"profile_banner_url"："https://pbs.twimg.com/profile_banners/796185239370379264/1531817659"，"default_profile"：true，"default.profile_image"：false，"follow_requestrongent"：null，"notifications"：null｝，"geo"：null"coordinates"：null、"place"：null"、"contributors"：null、"is_quote_status"：false、"extended_tweet"：｛"full_text"："@nparama1951@RBhamaria@jyotsnavarma9@JaganNKaushik@DineshA58@Am_dilip@ashvinn15@SChakram@Savitritvs@Satishrathd100@banerji1@ramakirao@RaghavendraUp16@ads7506@Kumaran92023000@kjayashree31@gouranga1964@SureshanDelhi@ravi_sec@faramroze@kavita_wari@kailashkaushik8@pushprajdumraon@FatychandJ@sukmaranlens@Hydbiryani11@ChaudhrGurnam@DurgaMaddikon da@ShibaBhanja@aarjeekaykannan@singhsantosh98@badi_mishra@LighteningBolt9@birajanath@sn_ojha@Speakwithsence1@dharmvirjangra9@LillyMaryPinto@RohiniShah73@postcard_news@MailOnline@EconomicTimes@narendramodi@BJP4India@AmitShah@TarekFatah@TrueIndology@republic@TelegraphTech 10名纳萨尔人在恰蒂斯加尔邦与安全部队的遭遇中丧生"，"display_text_range"：[666731]，"实体"：｛"标签"：[]，"url"：[]，"user_mentations"：[｛"screen_name"："nparama1951"，"name"："N.Paramasivam"，"id"：2783847553，"id_str"："278384755">

这看起来像ndjson。ndjson和jsonlite包都可以处理它。

# Read the newline-delimited JSON file with ndjson::stream_in(). The printed
# result below shows nested fields spread into dot-separated columns
# (e.g. entities.user_mentions.0.id_str), giving 1,584 rows x 2,814 columns.
dat <- ndjson::stream_in("data/INCIndia26febru.json")
# Source: local data table [1,584 x 2,814]
# 
# # A tibble: 1,584 x 2,814
#    contributors coordinates created_at entities.hashta… entities.symbols entities.urls entities.user_m…
#           <int>       <int> <chr>                 <int>            <int>         <int>            <dbl>
#  1           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  2           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  3           NA          NA Tue Feb 2…               NA               NA            NA          2.23e 8
#  4           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
#  5           NA          NA Tue Feb 2…               NA               NA            NA          1.06e18
#  6           NA          NA Tue Feb 2…               NA               NA            NA          1.47e 8
#  7           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
#  8           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
#  9           NA          NA Tue Feb 2…               NA               NA            NA          1.15e 9
# 10           NA          NA Tue Feb 2…               NA               NA            NA          7.44e 7
# # … with 1,574 more rows, and 2,807 more variables: entities.user_mentions.0.id_str <chr>,
# #   entities.user_mentions.0.indices.0 <dbl>, entities.user_mentions.0.indices.1 <dbl>,
# #   entities.user_mentions.0.name <chr>, entities.user_mentions.1.id <dbl>,
# #   entities.user_mentions.0.screen_name <chr>, entities.user_mentions.1.id_str <chr>,
# #   entities.user_mentions.1.indices.0 <dbl>, retweeted_status.entities.user_mentions.0.name <chr>,
# #   retweeted_status.entities.user_mentions.0.screen_name <chr>,
# #   retweeted_status.extended_tweet.display_text_range.0 <dbl>,
# #   retweeted_status.entities.user_mentions.0.id_str <chr>,
# #   retweeted_status.entities.user_mentions.0.indices.0 <dbl>,
# #   retweeted_status.entities.user_mentions.0.indices.1 <dbl>, retweeted_status.entities.urls.0.url <chr>,
# #   retweeted_status.entities.user_mentions.0.id <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.media_url_https <chr>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.h <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.medium.h <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.resize <chr>,
# #   retweeted_status.extended_tweet.entities.media.0.sizes.large.w <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.indices.0 <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.indices.1 <dbl>,
# #   retweeted_status.extended_tweet.entities.media.0.media_url <chr>, in_reply_to_user_id_str <chr>,
# #   in_reply_to_status_id_str <chr>, is_quote_status <lgl>, lang <chr>, id_str <chr>,
# #   in_reply_to_screen_name <chr>, in_reply_to_status_id <dbl>, in_reply_to_user_id <dbl>, retweet_count <dbl>,
# #   place <int>, quote_count <dbl>, retweeted <lgl>, retweeted_status.contributors <int>, reply_count <dbl>,
# #   retweeted_status.coordinates <int>, retweeted_status.created_at <chr>, user.created_at <chr>,
# #   user.contributors_enabled <lgl>, retweeted_status.user.utc_offset <int>,
# #   retweeted_status.user.verified <lgl>, source <chr>, text <chr>, timestamp_ms <chr>, truncated <lgl>,
# #   user.default_profile <lgl>, user.following <int>, user.default_profile_image <lgl>, user.description <chr>,
# #   user.follow_request_sent <int>, user.followers_count <dbl>, user.friends_count <dbl>,
# #   user.favourites_count <dbl>, quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.resize <chr>,
# #   quoted_status.extended_tweet.extended_entities.media.0.sizes.thumb.w <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.0 <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.aspect_ratio.1 <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.type <chr>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.duration_millis <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.video_info.variants.0.bitrate <dbl>,
# #   quoted_status.extended_tweet.extended_entities.media.0.url <chr>,
# #   retweeted_status.entities.hashtags.0.indices.0 <dbl>, retweeted_status.entities.hashtags.0.indices.1 <dbl>,
# #   retweeted_status.entities.hashtags.0.text <chr>, favorite_count <dbl>,
# #   entities.user_mentions.1.indices.1 <dbl>, entities.user_mentions.1.name <chr>,
# #   entities.user_mentions.1.screen_name <chr>, favorited <lgl>, filter_level <chr>, geo <int>, id <dbl>,
# #   retweeted_status.favorite_count <dbl>, retweeted_status.favorited <lgl>, retweeted_status.filter_level <chr>,
# #   retweeted_status.extended_tweet.full_text <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.resize <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.sizes.thumb.w <dbl>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.type <chr>,
# #   retweeted_status.extended_tweet.extended_entities.media.1.url <chr>, user.id_str <chr>,
# #   user.is_translator <lgl>, user.lang <chr>, user.listed_count <dbl>, user.location <chr>, user.name <chr>,
# #   user.geo_enabled <lgl>, user.id <dbl>, retweeted_status.display_text_range.0 <dbl>,
# #   retweeted_status.display_text_range.1 <dbl>, retweeted_status.entities.hashtags <int>,
# #   retweeted_status.entities.symbols <int>, retweeted_status.entities.urls.0.display_url <chr>,
# #   retweeted_status.entities.urls.0.expanded_url <chr>, retweeted_status.entities.urls.0.indices.0 <dbl>,
# #   retweeted_status.entities.urls.0.indices.1 <dbl>, retweeted_status.geo <int>, …

或者：

# Alternative: read the same file through a file connection with
# jsonlite::stream_in(). This overwrites `dat` from the ndjson example.
dat <- jsonlite::stream_in(file("data/INCIndia26febru.json"))
# Print a compact per-column summary. In the output below, nested objects
# such as `user` and `place` show up as data.frame columns (36 variables).
tibble::glimpse(dat)
# Observations: 1,584
# Variables: 36
# $ created_at                <chr> "Tue Feb 26 13:09:36 +0000 2019", "Tue Feb 26 13:09:38 +0000 2019", "Tue Feb …
# $ id                        <dbl> 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.100382e+18, 1.10038…
# $ id_str                    <chr> "1100382373823299586", "1100382382396448770", "1100382385013645314", "1100382…
# $ text                      <chr> "RT @INCIndia: Congress President @RahulGandhi addresses North-East DCC &amp;…
# $ source                    <chr> "<a href="http://twitter.com/download/android" rel="nofollow">Twitter for…
# $ truncated                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ in_reply_to_status_id     <dbl> NA, 1.100360e+18, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_status_id_str <chr> NA, "1100359823630237697", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ in_reply_to_user_id       <dbl> NA, 1.153045e+09, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_user_id_str   <chr> NA, "1153045459", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
# $ in_reply_to_screen_name   <chr> NA, "INCIndia", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ user                      <data.frame> <data.frame[38 x 39]>
# $ geo                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ coordinates               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ place                     <data.frame> <data.frame[38 x 9]>
# $ contributors              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# $ retweeted_status          <data.frame> <data.frame[38 x 34]>
# $ is_quote_status           <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FA…
# $ quote_count               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ reply_count               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ retweet_count             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ favorite_count            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# $ entities                  <data.frame> <data.frame[38 x 5]>
# $ favorited                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ retweeted                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
# $ filter_level              <chr> "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", "low", …
# $ lang                      <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en",…
# $ timestamp_ms              <chr> "1551186576692", "1551186578736", "1551186579360", "1551186579893", "15511865…
# $ display_text_range        <list> [NULL, <23, 100>, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL…
# $ possibly_sensitive        <lgl> NA, NA, NA, NA, FALSE, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
# $ quoted_status_id          <dbl> NA, NA, NA, NA, NA, NA, 1.100365e+18, NA, NA, 1.100365e+18, NA, NA, 1.100365e…
# $ quoted_status_id_str      <chr> NA, NA, NA, NA, NA, NA, "1100364859768782849", NA, NA, "1100364859768782849",…
# $ quoted_status             <data.frame> <data.frame[38 x 32]>
# $ quoted_status_permalink   <data.frame> <data.frame[38 x 3]>
# $ extended_tweet            <data.frame> <data.frame[38 x 4]>
# $ extended_entities         <data.frame> <data.frame[38 x 1]>

相关内容

最新更新

热门标签：