Scraping the Douban Top 250 movie ranking

Published: 2019-05-30 21:09:49

    Feature description, V1.0:

    Scrape the Douban Top 250 movie ranking.

    Functional analysis:

    Libraries used:

    1、time

    2、json

    3、requests

    4、BeautifulSoup (see the short example after this list)

    5、RequestException
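
    Of these, only requests, BeautifulSoup (the bs4 package) and the lxml parser used below are third-party; time and json are in the standard library, and RequestException is shipped inside requests. A minimal, self-contained sketch of the BeautifulSoup calls the V1.0 parser relies on, run against a simplified fragment shaped like one list entry (the fragment is illustrative, not copied from Douban):

    from bs4 import BeautifulSoup

    html = '''
    <ol class="grid_view">
      <li>
        <em class="">1</em>
        <span class="title">肖申克的救赎</span>
        <span class="rating_num">9.7</span>
      </li>
    </ol>
    '''

    soup = BeautifulSoup(html, 'lxml')              # 'html.parser' also works if lxml is missing
    ol = soup.find('ol', {'class': 'grid_view'})    # the <ol> that holds all entries on a page
    first = ol.find_all('li')[0]                    # one <li> per movie
    print(first.find('span', {'class': 'title'}).text.strip())       # 肖申克的救赎
    print(first.find('span', {'class': 'rating_num'}).text.strip())  # 9.7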

    Lab code:

    """
        作者:李舵
        日期:2019-4-27
        功能:抓取豆瓣电影top250
        版本:V1.0
    """
    
    import time
    import json
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import RequestException
    
    
    def get_one_page(url):
        """Fetch one list page and return its HTML text, or None on any failure."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    
    def parse_one_page(html):
        """Parse one list page and yield a dict (rank, title, credits, score) per movie."""
        soup = BeautifulSoup(html, 'lxml')
        ol_list = soup.find('ol', {'class': 'grid_view'})
        li_list = ol_list.find_all('li')
        for movie_item in li_list:            # one <li> per movie, 25 per page
            yield {
                'index': movie_item.find('em', {'class': ''}).text.strip(),
                'title': movie_item.find('span', {'class': 'title'}).text.strip(),
                'actor': movie_item.find('p', {'class': ''}).text.strip(),
                'score': movie_item.find('span', {'class': 'rating_num'}).text.strip()
            }
    
    
    def write_to_file(content):
        """Append one movie record to result.txt as a single JSON line."""
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    
    def main(start):
        """Crawl one page of the ranking (25 movies starting at offset `start`) and save it."""
        url = 'https://movie.douban.com/top250?start=' + str(start)
        html = get_one_page(url)
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
    
    
    if __name__ == '__main__':
        for i in range(0, 250, 25):   # 10 pages, 25 movies each
            main(start=i)
            time.sleep(1)             # pause briefly between requests
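
    Each call to write_to_file appends one record as a single JSON line, so result.txt ends up in JSON-lines form. A minimal sketch of loading it back (assuming result.txt was produced by a full run of the script above):

    import json

    movies = []
    with open('result.txt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:                              # skip any blank lines
                movies.append(json.loads(line))

    print(len(movies))                            # 250 after all ten pages
    print(movies[0]['title'], movies[0]['score'])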
    

     

    Feature description, V2.0:

    Scrape the Douban Top 250 movie ranking, this time with regular expressions.

    Functional analysis:

    Libraries used:

    1、re (see the short sketch after this list)

    2、time

    3、requests

    4、RequestException
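
    V2.0 drops BeautifulSoup and extracts every field with one long regular expression instead. Two ideas carry the whole pattern: the re.S flag, which lets '.' match newlines, and non-greedy (.*?) groups that stop at the next literal fragment. A tiny self-contained sketch of the same idea on a made-up fragment (not real Douban markup):

    import re

    html = '<em class="">1</em>\n<span class="title">肖申克的救赎</span>\n<span class="rating_num">9.7</span>'

    pattern = re.compile('<em.*?class="">(.*?)</em>.*?'
                         '<span.*?class="title">(.*?)</span>.*?'
                         '<span.*?class="rating_num">(.*?)</span>', re.S)  # re.S: '.' also matches '\n'

    print(re.findall(pattern, html))   # [('1', '肖申克的救赎', '9.7')]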

    Lab code:

    """
    作者:李舵
    日期:2019 - 4 - 8
    功能:抓取豆瓣电影top250
    版本:V2.0
    """
    
    import re
    import time
    import requests
    from requests.exceptions import RequestException
    
    
    def get_one_page(url):
        """Fetch one list page and return its HTML text, or None on any failure."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    
    def parse_one_page(html):
        """Extract the ten fields of every movie on one list page with a single regex."""
        pattern = re.compile(u'<div.*?class="item">.*?'
                                + u'<div.*?class="pic">.*?'
                                + u'<em.*?class="">(.*?)</em>.*?'             # 1 rank
                                + u'<div.*?class="info">.*?'
                                + u'<span.*?class="title">(.*?)</span>.*?'    # 2 title
                                + u'<span.*?class="other">(.*?)</span>.*?'    # 3 alternative title
                                + u'<div.*?class="bd">.*?'
                                + u'<p.*?class="">.*?'
                                + u'导演:\\s(.*?)\\s.*?<br>'                  # 4 director
                                + u'(.*?) / '                                 # 5 release year
                                + u'(.*?) / (.*?)</p>.*?'                     # 6 country/region, 7 genre
                                + u'<div.*?class="star">.*?'
                                + u'<span.*?class="rating_num".*?property="v:average">'
                                + u'(.*?)</span>.*?'                          # 8 score
                                + u'<span>(.*?)人评价</span>.*?'               # 9 number of ratings
                                + u'<span.*?class="inq">(.*?)</span>', re.S)  # 10 one-line review
        movies = re.findall(pattern, html)
        movie_list = []
        for movie in movies:
            movie_list.append([movie[0],
                               movie[1],
                               movie[2].lstrip(' / '),   # drop the leading " / " of the alternative title
                               movie[3],
                               movie[4].lstrip(),
                               movie[5],
                               movie[6].strip(),
                               movie[7],
                               movie[8],
                               movie[9]])
        return movie_list
    
    
    def write_to_file(movie_list):
        """Append the records of one page to top_250.txt as labelled text blocks."""
        # 'a' rather than 'w': main() calls this once per page, and 'w' would
        # overwrite the file each time, keeping only the last 25 movies.
        with open('top_250.txt', 'a', encoding='utf-8') as f:
            for movie in movie_list:
                f.write('电影排名:' + movie[0] + '\n')
                f.write('电影名称:' + movie[1] + '\n')
                f.write('电影别名:' + movie[2] + '\n')
                f.write('导演:' + movie[3] + '\n')
                f.write('上映年份:' + movie[4] + '\n')
                f.write('制作国家/地区:' + movie[5] + '\n')
                f.write('电影类别:' + movie[6] + '\n')
                f.write('评分:' + movie[7] + '\n')
                f.write('参评人数:' + movie[8] + '\n')
                f.write('简短影评:' + movie[9] + '\n')
                f.write('\n')
            print('成功写入文件,共有%d条记录……' % len(movie_list))
    
    
    def main(start):
        """Crawl one page of the ranking (25 movies starting at offset `start`) and save it."""
        url = 'https://movie.douban.com/top250?start=' + str(start)
        html = get_one_page(url)
        movie_list = parse_one_page(html)
        write_to_file(movie_list)
    
    
    if __name__ == '__main__':
        for i in range(0, 250, 25):   # 10 pages, 25 movies each
            main(start=i)
            time.sleep(1)             # pause briefly between requests
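
    Because V2.0 writes labelled plain text rather than JSON, a quick way to verify that all ten pages reached the file is to count the '电影排名:' lines. A minimal sketch, assuming top_250.txt was produced by the run above:

    with open('top_250.txt', encoding='utf-8') as f:
        count = sum(1 for line in f if line.startswith('电影排名:'))
    print('records found:', count)    # expect 250 after a full run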
    

      

