When you search online for web-scraping code, most of what you find can only extract data from a single page. The goal of this article is to walk through a Python script that automatically scrapes data from multiple web pages.
Beautiful Soup is a Python package for parsing HTML and XML documents. It builds a parse tree for the page, which can then be queried to extract data from the HTML, making it very useful for web scraping.
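To see what this looks like in practice, here is a minimal standalone sketch (the HTML fragment is just illustrative, not part of the scraper below) that parses a snippet and extracts a tag's text:

from bs4 import BeautifulSoup

html = '<div><span class="a-profile-name">Jane Doe</span></div>'
soup = BeautifulSoup(html, 'html.parser')
# find_all returns every matching tag; get_text() extracts the inner text
print(soup.find_all('span', class_='a-profile-name')[0].get_text())  # Jane Doe

The scraper below uses exactly this find_all/get_text pattern, just applied to live review pages. First, the imports: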
import requests
from bs4 import BeautifulSoup as bs
from random import randint  # randint and sleep are used to pause between requests
from time import sleep
The following lists store the extracted data. Each pair holds the raw tag objects per page and the corresponding plain-text values.
name = []            # raw <span> tags for reviewer names, one batch per page
cust_name = []       # reviewer names as plain text
titles = []          # raw review-title tags
review_title = []    # review titles as plain text
ratings = []         # raw rating tags
rate = []            # ratings as plain text (e.g. "5.0 out of 5 stars")
reviews = []         # raw review-body tags
review_content = []  # review bodies as plain text
In the URLS list, paste the link to page 2 of the review pages, with the trailing page number removed: the loop below builds each page's address as URL + str(page), so each base URL must end exactly where the page number goes (on Amazon this is typically the pageNumber= query parameter).
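The original article's example link is not reproduced here; as a hypothetical placeholder, URLS could be defined as follows (substitute your own product's review link, keeping everything up to but not including the page number):

# Hypothetical placeholder; replace with your own review-page link(s)
URLS = ['https://www.amazon.com/product-name/product-reviews/ASIN/?pageNumber=']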
for URL in URLS:
    for page in range(2, 11):  # scrape review pages 2 through 10
        # Note: Amazon may block the default requests User-Agent; if the response
        # is not 200, try passing a headers= dict with a browser User-Agent string.
        pages = requests.get(URL + str(page))
        print(URL)
        print(URL + str(page))
        print(pages)  # <Response [200]> means the page was fetched successfully
        soup = bs(pages.content, 'html.parser')

        # Reviewer names
        names = soup.find_all('span', class_='a-profile-name')
        name.append(names)
        for i in range(0, len(names)):
            cust_name.append(names[i].get_text())

        # Review titles (strip the newlines that surround each title)
        title = soup.find_all('a', class_='review-title-content')
        titles.append(title)
        for i in range(0, len(title)):
            review_title.append(title[i].get_text())
        review_title[:] = [t.strip('\n') for t in review_title]

        # Star ratings
        rating = soup.find_all('i', class_='review-rating')
        ratings.append(rating)
        for i in range(0, len(rating)):
            rate.append(rating[i].get_text())

        # Review bodies (again stripping the surrounding newlines)
        review = soup.find_all("span", {"data-hook": "review-body"})
        reviews.append(review)
        for i in range(0, len(review)):
            review_content.append(review[i].get_text())
        review_content[:] = [r.strip('\n') for r in review_content]

        # Pause 1-3 seconds between requests (this is what the randint/sleep
        # imports above are for) so the server is not hit too quickly.
        sleep(randint(1, 3))
The following snippet prints the extracted customer names, review titles, ratings, and review bodies, together with how many of each were collected.
print("Customer names are ",cust_name)
print(len(cust_name))
print("Review Title is ",review_title)
print(len(review_title))
print("Number Ratings are ",rate)
print(len(rate))
print("Actual reviews are ",review_content)
print(len(review_content))
import pandas as pd

# Put each list into its own single-column DataFrame, then join them side by side
df1 = pd.DataFrame()
df1['Customer Name'] = cust_name
df2 = pd.DataFrame()
df2['Review title'] = review_title
df3 = pd.DataFrame()
df3['Ratings'] = rate
df4 = pd.DataFrame()
df4['Reviews'] = review_content
frames = [df1, df2, df3, df4]
result = pd.concat(frames, axis=1)  # axis=1 joins the columns; shorter lists are padded with NaN
result.shape                          # (rows, columns) of the combined DataFrame
result['Ratings'].value_counts()      # how often each star rating appears
result['Ratings'].isnull().sum()      # missing values in the Ratings column
result.isnull().sum()                 # missing values per column
result = result.dropna(axis=0, subset=['Ratings'])  # drop rows with no rating
result.isnull().sum()
result.shape                          # shape after dropping the incomplete rows
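A natural last step, not shown in the original, is to persist the cleaned DataFrame to disk; a minimal sketch (the file name reviews.csv is an arbitrary choice):

result.to_csv('reviews.csv', index=False)  # write the scraped reviews to a CSV file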