可参看
抓取 YouTube 播放列表数据的前提是:部署 Scrapy 的机器能够正常访问 YouTube 网站。
存入 MongoDB 的数据如下:
{ "playlist_id" : "PLEbPmOCXPYV67l45xFBdmodrPkhzuwSe9", "videos" : [ { "playlist_id" : "PLEbPmOCXPYV67l45xFBdmodrPkhzuwSe9", "video_id" : "9pTwztLOvj4", "thumbnail" : [ { "url" : "https://i.ytimg.com/vi/9pTwztLOvj4/hqdefault.jpg?sqp=-oaymwEZCPYBEIoBSFXyq4qpAwsIARUAAIhCGAFwAQ==&rs=AOn4CLCmUXUPe-HgXiie0SRfL5cYz0JRrg", "width" : 245, "height" : 137 } ], "title" : "Legend of the galactic heroes (1988) episode 1", "index" : 1, "length_seconds" : 1445, "is_playable" : true }, { "playlist_id" : "PLEbPmOCXPYV67l45xFBdmodrPkhzuwSe9", "video_id" : "zzD1xU37Vtc", "thumbnail" : [ { "url" : "https://i.ytimg.com/vi/zzD1xU37Vtc/hqdefault.jpg?sqp=-oaymwEZCPYBEIoBSFXyq4qpAwsIARUAAIhCGAFwAQ==&rs=AOn4CLCnLCYaZVBeHnZR0T73rfEd_Dbyew", "width" : 245, "height" : 137 } ], "title" : "Legend of the galactic heroes (1988) episode 2", "index" : 2, "length_seconds" : 1447, "is_playable" : true }, ……(其余视频条目省略) ] }
代码如下:
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from scrapy import Selector
from knowsmore.items import YoutubePlaylistItem, YoutubePlaylistVideoItem
from ..common import *


class YoutubeListSpider(scrapy.Spider):
    """Scrape the video list of a single YouTube playlist page.

    The playlist page embeds its data as a JSON object assigned to
    ``window["ytInitialData"]``. ``parse`` extracts that object and yields
    one ``YoutubePlaylistItem`` whose ``videos`` field holds one
    ``YoutubePlaylistVideoItem`` per entry found in the page data.
    """

    name = 'youtube_list'
    allowed_domains = ['www.youtube.com']
    start_urls = ['https://www.youtube.com/playlist?list=PLEbPmOCXPYV67l45xFBdmodrPkhzuwSe9']

    def parse(self, response):
        """Extract the playlist and its videos from the playlist page.

        Args:
            response: The Scrapy response for the playlist URL.

        Yields:
            A single ``YoutubePlaylistItem`` with ``playlist_id`` and
            ``videos`` populated. Nothing is yielded when the embedded
            ``ytInitialData`` object cannot be located in the page.

        Raises:
            ValueError: If the extracted text is not valid JSON
                (propagated from ``json.loads``).
            KeyError, IndexError: If the page data does not have the
                expected nesting.
        """
        # Match against the decoded text rather than the raw bytes in
        # response.body: the pattern and the '%s}}' formatting below are
        # both str-based, which breaks on bytes under Python 3.
        # NOTE(review): r1 comes from ..common; presumably it returns the
        # first capture group, or a falsy value when there is no match.
        raw_json = r1(r'window\["ytInitialData"\] = (.*?)}};', response.text)
        if not raw_json:
            self.logger.warning('ytInitialData not found in %s', response.url)
            return

        # The pattern uses the trailing "}}" as its delimiter, so the
        # captured text lacks it; restore it before decoding.
        initial_data = json.loads('%s}}' % raw_json)

        # Walk down to the renderer that carries the playlist contents.
        first_tab = initial_data['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]
        first_section = first_tab['tabRenderer']['content']['sectionListRenderer']['contents'][0]
        playlist_info = first_section['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']

        playlist_id = playlist_info['playlistId']

        # One video item per entry of the playlist renderer.
        videos = []
        for entry in playlist_info['contents']:
            video_info = entry['playlistVideoRenderer']
            videos.append(YoutubePlaylistVideoItem(
                playlist_id=playlist_id,
                video_id=video_info['videoId'],
                thumbnail=video_info['thumbnail']['thumbnails'],
                title=video_info['title']['simpleText'],
                index=video_info['index']['simpleText'],
                length_seconds=video_info['lengthSeconds'],
                is_playable=video_info['isPlayable'],
            ))

        yield YoutubePlaylistItem(
            playlist_id=playlist_id,
            videos=videos,
        )