
# Scrape the top-100 ranked movies from a movie website (爬取某电影网站排名前100的电影)
# -*- coding:utf-8 -*-
#获取要爬取的html
import re
from multiprocessing.pool import Pool
import requests
def get_html_page(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself fails.
    """
    # Send a browser-like User-Agent header along with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Best-effort fetch: report the failure and signal it with None.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html_page(html):
#正则表达式表示好难啊,这个还是用别人的,自己还得继续加油学习
pattern = repile('<dd>.*?board-index.*?>(d+)</i>.*?data-src="(.*?)".*?name"><a'