(1)F12控制台找到并分析Network
(2)观察页面,能否找到重复可循环的翻页规律(例如 page=1、page=2 …)
(1)引入包
import requests
import re
import pandas as pd
(2)准备工作
# 1. Target listing page URL (page 1).
url = "https://www.71ab.com/company/search.php?page=1"
# 2. Fetch the page with requests.
response = requests.get(url)
# 3. Decode the response body as UTF-8 text.
html = response.content.decode("UTF-8")
(3)re登场
# 4. Pull every company name out of the <strong class="px14"> tags
#    (see question (1) below for a re.findall primer).
company_pattern = '<strong class="px14">(.*?)</strong>'
data = re.findall(company_pattern, html, re.S)
(4)pandas登场
# 5. Export the scraped names to CSV with pandas.
#    NOTE: pd.DataFrame().append() was deprecated in pandas 1.4 and removed
#    in 2.0 — build the frame from the data directly instead.
message = pd.DataFrame(data)
new_date = pd.concat([message], axis=0)
# index=False: no index column; mode='a': append to the file; header=False: no header row
new_date.to_csv('中国企业在线.csv', index=False, mode='a', header=False)
(5)截止到这里,已经可以爬取第一页数据了
(6)在批量爬取数据之前,我们需要变成蒙面人
# 6. Browser-like request headers so the site treats us as a normal visitor.
_ua = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/83.0.4103.116 Safari/537.36"
)
headers = {"user-agent": _ua}
(7)伪装IP(数据量小可以不用,数据量很大就要考虑啦,要不就等着顺着网线找你吧)
推荐蚂蚁代理,亲测好用(不是广告,因为全程是真的免费)
(8)批量爬取数据(对页面进行了改造,看的更方便些,贴全部代码)
import requests
import re
import pandas as pd
def main(url):
    """Scrape one listing page and append the company names to the CSV.

    url: full page URL, e.g. https://www.71ab.com/company/search.php?page=1
    """
    # 6. Browser-like User-Agent. The original built this dict but never
    #    sent it — pass it to requests.get so the disguise actually works.
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    # 2. Fetch the page.
    response = requests.get(url, headers=headers)
    # 3. Decode the raw bytes as UTF-8.
    html = response.content.decode("UTF-8")
    # 4. Extract every company name inside <strong class="px14"> tags.
    data = re.findall('<strong class="px14">(.*?)</strong>', html, re.S)
    # 5. Append the names to the CSV (no index column, no header row).
    #    NOTE: pd.DataFrame().append() was removed in pandas 2.0 — build the
    #    frame from the list directly.
    message = pd.DataFrame(data)
    message.to_csv('中国企业在线.csv', index=False, mode='a', header=False)
if __name__ == '__main__':
    # 7. Listing pages 2-4 (page 1 was scraped in the walkthrough above).
    urls = ['https://www.71ab.com/company/search.php?page={}'.format(str(i)) for i in range(2, 5)]
    # 8. Scrape each page in turn.
    for url in urls:
        print(url)  # echo progress to the console
        main(url)
(1)re.findall
# 01.返回string中所有与pattern相匹配的全部字串,返回形式为数组
findall(pattern, string, flags=0)
# 02. findall returns a list of every non-overlapping match; the r prefix
#     marks a raw string so backslashes reach the regex engine untouched.
url_v1 = "https://docs.python.org/3/whatsnew/3.6.html"
regular_v1 = re.findall(r"docs", url_v1)
print(regular_v1)
# ['docs']
# 03. ^ anchors the pattern to the start of the string, so "https" is
#     returned only when the string begins with it.
url_v2 = "https://docs.python.org/3/whatsnew/3.6.html"
regular_v2 = re.findall(r"^https", url_v2)
print(regular_v2)
# ['https']
# 04. $ anchors the pattern to the end of the string: return "html" only if
#     the string ends with it.  (The original listing was garbled by an HTML
#     entity — r"html#34;, should read r"html$".)
regular_v3 = re.findall(r"html$", "https://docs.python.org/3/whatsnew/3.6.html")
print(regular_v3)
# ['html']
# 05. [...] matches any single character listed inside the brackets,
#     so [t,w]h matches "th", "wh" (and ",h", which does not occur here).
url_v4 = "https://docs.python.org/3/whatsnew/3.6.html"
regular_v4 = re.findall(r"[t,w]h", url_v4)
print(regular_v4)
# ['th', 'wh']
# 06. \d matches a single digit 0-9; the result is returned as a list.
#     (The backslashes were stripped in the original listing: r"d" should
#     read r"\d" and r"ddd" should read r"\d\d\d", as the printed outputs
#     below confirm.)
regular_v5 = re.findall(r"\d", "https://docs.python.org/3/whatsnew/3.6.html")
regular_v6 = re.findall(r"\d\d\d", "https://docs.python.org/3/whatsnew/3.6.html/1234")
print(regular_v5)
# ['3', '3', '6']
print(regular_v6)
# ['123']
# 07. \d matches digits 0-9; \D is its complement — every character EXCEPT
#     a digit.  (Backslash stripped in the original: r"D" should read r"\D".)
regular_v7 = re.findall(r"\D", "https://docs.python.org/3/whatsnew/3.6.html")
print(regular_v7)
# ['h', 't', 't', 'p', 's', ':', '/', '/', 'd', 'o', 'c', 's', '.', 'p', 'y', 't', 'h', 'o', 'n', '.', 'o', 'r', 'g', '/', '/', 'w', 'h', 'a', 't', 's', 'n', 'e', 'w', '/', '.', '.', 'h', 't', 'm', 'l']
# 08. \w matches a "word" character: a-z, A-Z, 0-9 or underscore.
#     (Backslash stripped in the original: r"w" should read r"\w".)
regular_v8 = re.findall(r"\w", "https://docs.python.org/3/whatsnew/3.6.html")
print(regular_v8)
#['h', 't', 't', 'p', 's', 'd', 'o', 'c', 's', 'p', 'y', 't', 'h', 'o', 'n', 'o', 'r', 'g', '3', 'w', 'h', 'a', 't', 's', 'n', 'e', 'w', '3', '6', 'h', 't', 'm', 'l']
# 09. \W is the complement of \w: it matches everything that is NOT a
#     letter, digit or underscore — i.e. the punctuation here.
#     (Backslash stripped in the original: r"W" should read r"\W".)
regular_v9 = re.findall(r"\W", "https://docs.python.org/3/whatsnew/3.6.html")
print(regular_v9)
# [':', '/', '/', '.', '.', '/', '/', '/', '.', '.']
(2)pandas
推荐一个中文版API
https://www.pypandas.cn/docs/