Getting the review rating bubbles from TripAdvisor



How do I get the reviews from TripAdvisor?

This is my code using BeautifulSoup:

review_data = data.find_all('div', attrs={'class':'reviews-tab'})
for review in review_data:
    namareview = review.findNext('a', attrs={'class':'ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC'})[0].text.strip()
    ratingreview =  # <-- how do I get the rating here?
    tittlereview = data.find_all('a', attrs={'class':'location-review-review-list-parts-ReviewTitle__reviewTitleText--2tFRT'})[0].text.strip()
    print (namareview)

And how do I get the rating value of a review from the bubble rating?

<span class="ui_bubble_rating bubble_30"></span>

This is my code right now:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
url = "https://www.tripadvisor.com/Attraction_Review-g297722-d6611509-Reviews-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html"
response = requests.get(url)
data = BeautifulSoup(response.text, "html.parser")
print(data.title.text)
nama = data.find_all('h1', attrs={'class':'ui_header h1'})[0].text.strip()
print (nama)
category = data.find_all('div', attrs={'class':'attractions-attraction-review-header-AttractionLinks__detail--2-xvX'})[0].text.strip()
print (category)
location= data.find_all('div', attrs={'class':'attractions-contact-card-ContactCard__contactRow--3Ih6v'})[0].text.strip()
print (location)
review_data = data.find_all('div', attrs={'class':'reviews-tab'})
for review in review_data:
    namareview = review.findNext('a', attrs={'class':'ui_header_link social-member-event-MemberEventOnObjectBlock__member--35-jC'})[0].text.strip()
    bubblereview =  # <-- how do I get the bubble rating here?
    tittlereview = data.find_all('a', attrs={'class':'location-review-review-list-parts-ReviewTitle__reviewTitleText--2tFRT'})[0].text.strip()
    print (namareview,bubblereview,tittlereview)

That is my full code.

TripAdvisor is a tricky site, but not impossible. I'm not sure exactly what you want, but you can parse the JSON inside the script tags:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import json


url = "https://www.tripadvisor.co.uk/Attraction_Review-g297722-d6611509-Reviews-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
response = requests.get(url, headers=headers)
data = BeautifulSoup(response.text, "html.parser")
print(data.title.text)
nama = data.find_all('h1', attrs={'class':'ui_header h1'})[0].text.strip()
print (nama)
category = data.find_all('div', attrs={'class':'attractions-attraction-review-header-AttractionLinks__detail--2-xvX'})[0].text.strip()
print (category)
location= data.find_all('div', attrs={'class':'attractions-contact-card-ContactCard__contactRow--3Ih6v'})[0].text.strip()
print (location)
# Get Total count of reviews
data = BeautifulSoup(response.text, "html.parser")
reviewDataIDs = []
scripts = data.find_all('script')
for script in scripts:
    if 'window.__WEB_CONTEXT__=' in script.text:
        jsonStr = script.text
        jsonStr = jsonStr.split('window.__WEB_CONTEXT__={pageManifest:')[-1]
        iterateJson = True
        while iterateJson == True:
            try:
                jsonData = json.loads(jsonStr + '}')
                iterateJson = False
            except:
                jsonStr = jsonStr.rsplit('}',1)[0]

raiseError = True
for k, v in jsonData['urqlCache'].items():
    try:
        totalCount = jsonData['urqlCache'][k]['data']['locations'][0]['reviewListPage']['totalCount']  
        raiseError = False
        reviewDataIDs.append(k)
        break
    except:
        pass


def getJsonData(reviewCount, reviewDataIDs, continueLoop):
    while continueLoop == True:
        url = "https://www.tripadvisor.co.uk/Attraction_Review-g297722-d6611509-Reviews-or%s-Pahawang_Island-Bandar_Lampung_Lampung_Sumatra.html#REVIEWS"  %reviewCount   
        response = requests.get(url, headers=headers)
        data = BeautifulSoup(response.text, "html.parser")
        scripts = data.find_all('script')
        for script in scripts:
            if 'window.__WEB_CONTEXT__=' in script.text:
                jsonStr = script.text
                jsonStr = jsonStr.split('window.__WEB_CONTEXT__={pageManifest:')[-1]
                iterateJson = True
                while iterateJson == True:
                    try:
                        jsonData = json.loads(jsonStr + '}')
                        iterateJson = False
                    except:
                        jsonStr = jsonStr.rsplit('}',1)[0]

        raiseError = True
        for k, v in jsonData['urqlCache'].items():
            try:
                reviewData = jsonData['urqlCache'][k]['data']['locations'][0]['reviewListPage']['reviews']  
                raiseError = False
                if k not in reviewDataIDs:
                    continueLoop = False
                    reviewDataIDs.append(k)
                break
            except:
                pass
        if raiseError == True:
            raise ValueError ('Data could not be found.')
        if continueLoop == False:
            return reviewData, reviewDataIDs

# Get Reviews
for reviewCount in list(range(0,totalCount,5)):
    reviewData, reviewDataIDs = getJsonData(reviewCount, reviewDataIDs, continueLoop=True)
    for each in reviewData:
        rating = each['rating']
        title = each['title']
        text = each['text']
        user = each['username']
        print('Name: %s\nTitle: %s\nRating: %s\nReview: %s\n' % (user, title, rating, text) + '-'*70 + '\n')

Output:

Name: Hamdan O
Title: Great for snorkelling and beach fun
Rating: 4
Review: Get a boat from Ketapang Jetty. There were 4 piers with lots of boats to choose from. Choose from traditional wooden boats which are cheaper but slow paced or higher priced fast fiberglass speed boats. We haggled for a fast speed boat to take us snorkelling and island hopping for half a day at 700K Rupiah. We got it from Pak Yayat at Pier 2. Pahawang is excellent for snorkelling. Just off shore the island the residents built platforms with small food/drink booths. They moored the boats there as bases for snorkelling. You can hop from one platform to another. Fantastic ideas to preserve the corals but unfortunately the inexperienced snorkellers ravaged through some of the patches closer to te beach. Great for an overnight trip as well at some of the local folks' homestays on the island.
----------------------------------------------------------------------
Name: PaulusKK
Title: he Trip is just So So
Rating: 3
Review: the boat trip to Pahawang island to me is a bit unsafe, it was a small wooden boat, and the journey was bumpy with high waves, and the island itself almost have no attraction, and the lunch provided there was not good, I only enjoy the fresh coconut water.
----------------------------------------------------------------------
Name: damarwianggo
Title: Pahawang is awesome
Rating: 5
Review: It was a story that Pahawang Island is great place to visit. Then, when I had a chance to accompany students from SMAK IPEKA Palembang to visit Pahawang Island in Lampung, Pahawang is truly exciting. Our one-day-trip to Pahawang was really extraordinary. Moreover, all the students were really excited to join all activities during the trip. The guide helped us to enjoy the trip.
----------------------------------------------------------------------
Name: deddy p
Title: Awesome
Rating: 5
Review: One word i can tell about Pahawang..... Superb. Clean water, beautiful corals. Hope you can help to take care this beautiful environment. Keep it clean.....stay away from plastic.
----------------------------------------------------------------------
Name: kristi0308
Title: Clean beach
Rating: 3
Review: I felt like in pulau pari seribu island for the view 
The corals are dead but i saw lots of babies baracudas and a huge purple jellyfish and still got so many pretty little fish
Water are clean and people are not careless about environment as it was very clean when i swam in the island
Thanks to my boat man i paid him only 400k just for a day trip by myself
Paid boat parking every time i move like around 15-20k 
And snorkel gear for 30k
----------------------------------------------------------------------     
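Since pandas is already imported, here is a minimal sketch (under the assumption that totalCount, getJsonData and reviewDataIDs are set up as above) of collecting the reviews into a DataFrame instead of printing them; the CSV filename is only an illustration:

rows = []
for reviewCount in list(range(0, totalCount, 5)):
    reviewData, reviewDataIDs = getJsonData(reviewCount, reviewDataIDs, continueLoop=True)
    for each in reviewData:
        # keep only the fields used above; each review dict contains more keys
        rows.append({'user': each['username'],
                     'title': each['title'],
                     'rating': each['rating'],
                     'text': each['text']})

df = pd.DataFrame(rows)
df.to_csv('pahawang_reviews.csv', index=False)  # hypothetical output file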

The value of the bubble rating is expressed as the number at the end of the class name. Each bubble is worth 10, so ui_bubble_rating bubble_30 is a rating with 3 of 5 bubbles filled. Likewise, bubble_45 would have 4.5 of 5 bubbles filled. Since the number changes, you can use a regular expression to find all of these instances.

bubblereview = data.find_all('span', {'class': re.compile(r'ui_bubble_rating bubble_\d+')})

The resulting list:

[<span class="ui_bubble_rating bubble_45"></span>,
 <span class="ui_bubble_rating bubble_45"></span>,
 <span class="ui_bubble_rating bubble_40"></span>,
 <span class="ui_bubble_rating bubble_30"></span>,
 <span class="ui_bubble_rating bubble_50"></span>,
 <span class="ui_bubble_rating bubble_50"></span>,
 <span class="ui_bubble_rating bubble_30"></span>,
 <span class="ui_bubble_rating bubble_40"></span>,
 <span class="ui_bubble_rating bubble_35"></span>,
 <span class="ui_bubble_rating bubble_40"></span>,
 <span class="ui_bubble_rating bubble_40"></span>,
 <span class="ui_bubble_rating bubble_45"></span>,
 <span class="ui_bubble_rating bubble_40"></span>]

You can then filter out the ratings like this:

ratings = re.findall(r'\d+', ''.join(map(str, bubblereview)))
# ['45', '45', '40', '30', '50', '50', '30', '40', '35', '40', '40', '45', '40']
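
Since each bubble is worth 10, a small follow-up sketch (assuming the ratings list above) converts those strings into scores on the usual 5-point scale:

# convert '45' -> 4.5, '30' -> 3.0, and so on
scores = [int(r) / 10 for r in ratings]
# [4.5, 4.5, 4.0, 3.0, 5.0, 5.0, 3.0, 4.0, 3.5, 4.0, 4.0, 4.5, 4.0]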

Try this loop:

for review in data.select("div[class*='SingleReview']"):
    title = review.select_one(":scope a > span > span").get_text()
    bubble_tag = review.select_one(":scope span[class*='bubble']")
    rating = bubble_tag["class"][-1].split("_")[-1]
    print(f"({rating}){title}")

This also works if you only need the 5 reviews per page...

bubblereview = soup.find_all('div', {'class': re.compile('nf9vGX55')})
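
Class names like nf9vGX55 are generated by TripAdvisor's front-end build and change often, so treat that selector as an assumption. A minimal sketch of pulling the rating out of each of those containers (assuming soup is the parsed review page and re is imported) could look like this:

# each review container should hold one ui_bubble_rating span;
# the class suffix is the rating multiplied by 10
for container in soup.find_all('div', {'class': re.compile('nf9vGX55')}):
    bubble = container.find('span', {'class': re.compile('ui_bubble_rating')})
    if bubble is not None:
        rating = int(bubble['class'][-1].split('_')[-1]) / 10
        print(rating)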
