From b1e4e389f352211593e103d597802409ba4c9763 Mon Sep 17 00:00:00 2001
From: frkhit
Date: Sun, 14 Oct 2018 16:56:50 +0800
Subject: [PATCH 1/2] [cctv] Add support for downloading all videos of a
 cctv.com channel

---
 youtube_dl/extractor/cctv.py       | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |  2 +-
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py
index c76f361c6..434e9d896 100644
--- a/youtube_dl/extractor/cctv.py
+++ b/youtube_dl/extractor/cctv.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
@@ -189,3 +190,56 @@ class CCTVIE(InfoExtractor):
             'duration': duration,
             'formats': formats,
         }
+
+
+class CCTVChannelIE(InfoExtractor):
+    IE_DESC = '央视网 栏目'
+    _VALID_URL = r'http://tv.cctv.com/lm/(?P<id>[0-9A-Za-z-_]+)/?'
+    _TESTS = [{
+        'url': 'http://tv.cctv.com/lm/d10fys/',
+        'only_matching': True,
+    }]
+
+    def _entries(self, page, playlist_id):
+        re_req_item_id = re.compile(r'setItemByid[a-zA-Z0-9]+')
+        re_req_id_tmp = re.compile(r'videolistByColumnId\?id=[a-zA-Z0-9]+(?=&)')
+        re_req_id = re.compile(r'(?<=id=)[a-zA-Z0-9]+')
+        count_per_page = 100
+
+        req_item_id = re_req_item_id.findall(page)[0]
+        req_id = re_req_id.findall(re_req_id_tmp.findall(page)[0])[0]
+
+        page_num = 0
+        while True:
+            page_num += 1
+
+            url_template = "http://api.cntv.cn/lanmu/videolistByColumnId" + \
+                "?id={}&serviceId=tvcctv&type=0&n={}&t=jsonp&cb={}&p=".format(
+                    req_id, count_per_page, req_item_id)
+            content = self._download_webpage(url_template + str(page_num), playlist_id)
+            if not content:
+                break
+
+            content = content.rstrip()
+            req_item_id = re_req_item_id.findall(content)[0]
+
+            video_list = json.loads(content[(len(req_item_id) + 1):-2])["response"]["docs"]
+
+            for content_dict in video_list:
+                video_id, video_title, video_url = \
+                    content_dict["videoId"], content_dict["videoTitle"], content_dict["videoUrl"]
+                yield self.url_result(video_url, ie="CCTV", video_id=video_id, video_title=video_title)
+
+            if len(video_list) < count_per_page:
+                break
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel_page = self._download_webpage(
+            url, channel_id,
+            'Downloading channel page', fatal=False)
+        if channel_page is False:
+            raise Exception('CCTV said: Cannot connect to {}'.format(url))
+
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 464c8d690..10c0a3407 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -183,7 +183,7 @@ from .cbsnews import (
 from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ccma import CCMAIE
-from .cctv import CCTVIE
+from .cctv import CCTVIE, CCTVChannelIE
 from .cda import CDAIE
 from .ceskatelevize import (
     CeskaTelevizeIE,

From 2e6a1babfaefc4493ebcfb6b59f8400a9b41f568 Mon Sep 17 00:00:00 2001
From: frkhit
Date: Sun, 14 Oct 2018 18:08:32 +0800
Subject: [PATCH 2/2] [YoutubeDL] Add a playlist download log: record progress
 in a temporary pickle file, download each video in order, then re-download
 the broken videos
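
A minimal sketch of how the playlist loop drives PlaylistTaskLog once this
patch is applied (the task_id, URLs and the ok flag below are illustrative
placeholders, not values used by the patch itself):

    from youtube_dl.YoutubeDL import PlaylistTaskLog

    log = PlaylistTaskLog(task_id='example-playlist')  # progress file: <tempdir>/example-playlist.pkl
    log.add_task(number=1, n_entries=2, entry={'url': 'https://example.com/a'}, extra={})
    log.add_task(number=2, n_entries=2, entry={'url': 'https://example.com/b'}, extra={})

    for task in log:  # always yields the pending task with the fewest failed attempts
        ok = True  # stand-in for process_ie_result() finishing without an exception
        # success removes the task; failure bumps its retry counter until
        # max_retry_count is exceeded, after which the task is dropped
        log.commit(video_id=task['vid'], success=ok)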
---
 youtube_dl/YoutubeDL.py | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 143 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 38ba43a97..a3847b893 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -9,18 +9,21 @@ import copy
 import datetime
 import errno
 import fileinput
+import hashlib
 import io
 import itertools
 import json
 import locale
 import operator
 import os
+import pickle
 import platform
 import re
 import shutil
 import subprocess
 import socket
 import sys
+import tempfile
 import time
 import tokenize
 import traceback
@@ -109,6 +112,119 @@ from .version import __version__
 if compat_os_name == 'nt':
     import ctypes
 
+youtube_dl_url_key = "youtube_dl_url_key"
+
+
+class PlaylistTaskLog(object):
+    def __init__(self, task_id, tmp_path=None, max_retry_count=5, to_screen=None):
+        self.to_screen = to_screen or (lambda msg: sys.stdout.write('%s\n' % msg))
+        self._task_id = task_id
+        self._log = {}
+        self._tmp_path = tmp_path or tempfile.gettempdir()
+        self._log_file = os.path.join(self._tmp_path, "{}.pkl".format(self._task_id))
+        self._max_retry_count = max_retry_count
+        self._pkl_exists_before = False
+        # init log
+        self._init_log()
+
+    def _init_log(self):
+        if not os.path.exists(self._log_file):
+            with open(self._log_file, "wb"):
+                pass
+            self._pkl_exists_before = False
+            return
+
+        with open(self._log_file, "rb") as f:
+            self._log = pickle.load(f)
+            self._pkl_exists_before = True
+            return
+
+    def _update_config(self):
+        if not self._log:
+            if os.path.exists(self._log_file):
+                os.remove(self._log_file)
+            return
+
+        with open(self._log_file, "wb") as f:
+            pickle.dump(self._log, f)
+
+    @classmethod
+    def create_id_by_ie_result(cls, ie_result):
+
+        def byteify(object_input):
+            if isinstance(object_input, dict):
+                str_dict = {byteify(key): byteify(value) for key, value in object_input.items()}
+                return ";;;".join(["{}:{}".format(key, str_dict[key]) for key in sorted(str_dict.keys())])
+            elif isinstance(object_input, (list, tuple)):
+                return ";;;".join([byteify(element) for element in sorted(object_input)])
+            else:
+                return "{}".format(object_input)
+
+        try:
+            if youtube_dl_url_key in ie_result:
+                try:
+                    return hashlib.md5(ie_result[youtube_dl_url_key]).hexdigest()
+                except Exception:
+                    return hashlib.md5(ie_result[youtube_dl_url_key].encode("utf-8")).hexdigest()
+        except Exception:
+            pass
+
+        try:
+            return hashlib.md5(json.dumps(ie_result, sort_keys=True)).hexdigest()
+        except TypeError:
+            return hashlib.md5(byteify(ie_result).encode("utf-8")).hexdigest()
+
+    def __iter__(self):
+        # python 2
+        return self
+
+    def __next__(self):
+        # Python 3
+        def _pop_task_info():
+            if not self._log:
+                return None
+
+            sorted_list = sorted(self._log.keys(), key=lambda x: self._log[x]["count"])
+
+            return self._log[sorted_list[0]]
+
+        while True:
+            task_info = _pop_task_info()
+
+            if task_info is None:
+                raise StopIteration
+
+            return task_info["task"]
+
+    next = __next__  # python 2
+
+    def commit(self, video_id, success=True):
+        if video_id not in self._log:
+            self.to_screen("WARNING: {} not in playlist log!".format(video_id))
+            return
+
+        if success is True:
+            self._log.pop(video_id)
+        else:
+            self._log[video_id]["count"] += 1
+            if self._log[video_id]["count"] > self._max_retry_count:
+                self._log.pop(video_id)
+
+        self._update_config()
+        # self.to_screen("DEBUG: PlaylistTaskLog[{} left] is {}".format(len(self._log), self._log))
+        self.to_screen("DEBUG: left {} videos".format(len(self._log)))
+
+    def add_task(self, number, n_entries, entry, extra):
"{}".format(number) + + if not self._pkl_exists_before and video_id not in self._log: + self._log[video_id] = { + "count": 0, + "task": {"vid": video_id, "number": number, "n_entries": n_entries, "entry": entry, "extra": extra} + } + + self._update_config() + class YoutubeDL(object): """YoutubeDL class. @@ -800,6 +916,12 @@ class YoutubeDL(object): } self.add_default_extra_info(ie_result, ie, url) if process: + try: + if youtube_dl_url_key not in ie_result: + ie_result[youtube_dl_url_key] = url + except Exception: + pass + return self.process_ie_result(ie_result, download, extra_info) else: return ie_result @@ -972,8 +1094,10 @@ class YoutubeDL(object): x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + _download_log = PlaylistTaskLog(task_id=PlaylistTaskLog.create_id_by_ie_result(ie_result), + to_screen=self.to_screen) + for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) # This __x_forwarded_for_ip thing is a bit ugly but requires # minimal changes if x_forwarded_for: @@ -997,10 +1121,24 @@ class YoutubeDL(object): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) - playlist_results.append(entry_result) + _download_log.add_task(number=i, n_entries=len(entries), entry=entry, extra=extra) + + # try to download all videos + for task in _download_log: + self.to_screen('[download] Downloading No.{} in {} videos'.format(task["number"], task["n_entries"])) + + try: + entry_result = self.process_ie_result(task["entry"], + download=download, + extra_info=task["extra"]) + except Exception as e: + self.to_stderr("ERROR: {}".format(e)) + _download_log.commit(video_id=task["vid"], success=False) + else: + playlist_results.append(entry_result) + + _download_log.commit(video_id=task["vid"], success=True) + ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result