diff --git a/vo-scraper.py b/vo-scraper.py index a4ae064..7afd47a 100755 --- a/vo-scraper.py +++ b/vo-scraper.py @@ -51,12 +51,15 @@ gitlab_issue_page = gitlab_repo_page+"issues" gitlab_changelog_page = gitlab_repo_page+"-/tags/v" remote_version_link = gitlab_repo_page+"raw/master/VERSION" -program_version = '1.1' +program_version = '1.2' # For web requests user_agent = 'Mozilla/5.0' cookie_jar = requests.cookies.RequestsCookieJar() +# Store video sources in global list +video_src_collection = list() + # For stats link_counter = 0 download_counter = 0 @@ -65,21 +68,24 @@ # series_metadata_suffix = ".series-metadata.json" video_info_prefix = "https://video.ethz.ch/.episode-video.json?recordId=" -directory_prefix = "Lecture Recordings/" +directory_prefix = "Lecture Recordings" + os.sep # Default quality video_quality = "high" +# Boolean flags download_all = False verbose = False - print_src = False + +# Location of text files file_to_print_src_to = "" +history_file = "" quality_dict = { - 'low' : 0, + 'high' : 0, 'medium': 1, - 'high' : 2 + 'low' : 2 } class bcolors: @@ -207,20 +213,32 @@ def pretty_print_episodes(vo_json_data, selected): """Prints the episode numbers that match `selected`""" # Get length of longest strings for nice formatting when printing nr_length = len(" Nr.") + max_date_length = max([len(str(episode['createdAt'][:-6])) for episode in vo_json_data['episodes']]) max_title_length = max([len(episode['title']) for episode in vo_json_data['episodes']]) max_lecturer_length = max([len(str(episode['createdBy'])) for episode in vo_json_data['episodes']]) + # Print header + print_information( + " Nr." 
+ + " | " + + "Date".ljust(max_date_length) + + " | " + + "Name".ljust(max_title_length) + + " | " + + "Lecturer".ljust(max_lecturer_length) + ) + # Print the selected episodes for episode_nr in selected: episode = vo_json_data['episodes'][episode_nr] print_information( "%3d".ljust(nr_length) % episode_nr + " | " + + episode['createdAt'][:-6].ljust(max_date_length) + + " | " + episode['title'].ljust(max_title_length) + " | " + str(episode['createdBy']).ljust(max_lecturer_length) - + " | " + - episode['createdAt'][:-6] ) @@ -228,12 +246,15 @@ def vo_scrapper(vo_link, user, passw): """ Gets the list of all available videos for a lecture. Allows user to select multiple videos. - Afterwards passes the links to the video source to `downloader()` + Returns the selected episodes Keyword arguments: vo_link -- The link to the lecture user -- The username passed from a text file passw -- The password passed from a text file + + Returns: + A tuple consisting of the filename and the video_src_link """ global user_agent global download_all @@ -242,9 +263,6 @@ def vo_scrapper(vo_link, user, passw): global quality_dict global cookie_jar - global print_src - global file_to_print_src_to - global series_metadata_suffix global video_info_prefix global directory_prefix @@ -284,7 +302,7 @@ def vo_scrapper(vo_link, user, passw): # Print the user's choice if not choice: print_information("No videos selected") - return # Nothing to do anymore + return list() # Nothing to do anymore else: print_information("You selected:") pretty_print_episodes(vo_json_data, choice) @@ -300,7 +318,9 @@ def vo_scrapper(vo_link, user, passw): print_information("Keyboard interrupt detected, skipping lecture", type='warning') return - # Collect links and download them + local_video_src_collection = list() + + # Collect links for download for item_nr in choice: # Get link to video metadata json file item = vo_json_data['episodes'][item_nr] @@ -331,88 +351,122 @@ def vo_scrapper(vo_link, user, passw): 
versions.append((counter, vid_version['res']['w']*vid_version['res']['h'])) print_information(str(counter) + ": " + "%4d" %vid_version['res']['w'] + "x" + "%4d" %vid_version['res']['h'], verbose_only=True) counter += 1 - versions.sort(key=lambda tup: tup[1]) - # Now it's sorted: low -> medium -> high + versions.sort(key=lambda tup: tup[1], reverse=True) + # Now it's sorted: high -> medium -> low # Get video src url from json - video_src_link = video_json_data['streams'][0]['sources']['mp4'][versions[quality_dict[video_quality]][0]]['src'] + try: # try/except block to handle cases where not all three types of quality exist + video_src_link = video_json_data['streams'][0]['sources']['mp4'][versions[quality_dict[video_quality]][0]]['src'] + except IndexError: + print_information("Requested quality \"" + video_quality + "\" not available. Skipping episode!", type='error') + continue - lecture_titel = vo_json_data['title'] - video_title = vo_json_data["episodes"][item_nr]["title"] + lecture_title = vo_json_data['title'] + episode_title = vo_json_data["episodes"][item_nr]["title"] # If video and lecture title overlap, remove lecture title from video title - if video_title.startswith(lecture_titel): - video_title = video_title[len(lecture_titel):] - # Append date - video_title = item['createdAt'][:-6]+video_title + if episode_title.startswith(lecture_title): + episode_title = episode_title[len(lecture_title):] - # Create directory for video if it does not already exist - directory = directory_prefix + lecture_titel +"/" - if not os.path.isdir(directory): - os.makedirs(directory) - print_information("This folder was generated: " + directory, verbose_only=True) - else: - print_information("This folder already exists: " + directory, verbose_only=True) + # Extract episode name before adding the date to episode_title + episode_name = item['createdAt'][:-6] + " " + lecture_title + episode_title + + # Append date + episode_title = item['createdAt'][:-6]+episode_title # Filename 
is `directory/