diff --git a/README.md b/README.md index c0ffa6b..eb7fcad 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,12 @@ 1. [Download your Twitter archive](https://twitter.com/settings/download_your_data) (Settings > Your account > Download an archive of your data). 2. Unzip to a folder. 3. Right-click this link --> [parser.py](https://raw.githubusercontent.com/timhutton/twitter-archive-parser/main/parser.py) <-- and select "Save Link as", and save into the folder where you extracted the archive. (Or use wget or curl on that link. Or clone the git repo.) -4. Run parser.py with [Python 3](https://realpython.com/installing-python/). e.g. `python parser.py` from a command prompt opened in that folder. +4. Open a command prompt and change directory into the unzipped folder where you just saved parser.py. + (**Here's how to do that on Windows:** Hold shift while right-clicking in the folder. Click on `Open PowerShell`.) +5. Run parser.py with [Python 3](https://realpython.com/installing-python/). e.g. `python parser.py`. + (**On Windows:** When the command window opens, paste or enter `python parser.py` at the command prompt.) + + If you are having problems please check the [issues list](https://github.com/timhutton/twitter-archive-parser/issues?q=is%3Aissue) to see if it has happened before, and open a new issue otherwise. @@ -21,7 +26,7 @@ Our script does the following: - Replaces t.co URLs with their original versions (the ones that can be found in the archive). - Copies used images to an output folder, to allow them to be moved to a new home. - Will query Twitter for the missing user handles (checks with you first). -- Converts DMs to markdown, including the handles that we retrieved. Basic functionality for now (no embedded images), pending improvements. +- Converts DMs (including group DMs) to markdown with embedded media and links, including the handles that we retrieved. - Outputs lists of followers and following. - Downloads the original size images (checks with you first). @@ -30,6 +35,7 @@ Our script does the following: Some of the functionality requires the `requests` and `imagesize` modules. `parser.py` will offer to install these for you using pip. To avoid that you can install them before running the script. ## Articles about handling your Twitter archive: +- https://techcrunch.com/2022/11/21/quit-twitter-better-with-these-free-tools-that-make-archiving-a-breeze/ - https://www.bitsgalore.org/2022/11/20/how-to-preserve-your-personal-twitter-archive - https://matthiasott.com/notes/converting-your-twitter-archive-to-markdown diff --git a/parser.py b/parser.py old mode 100755 new mode 100644 index d8f97c9..c0ce179 --- a/parser.py +++ b/parser.py @@ -18,6 +18,8 @@ """ from collections import defaultdict +import math +from typing import Optional from urllib.parse import urlparse import datetime import glob @@ -30,6 +32,8 @@ import subprocess import sys import time +import traceback +from typing import List # hot-loaded if needed, see import_module(): # imagesize # requests @@ -41,24 +45,138 @@ class UserData: - def __init__(self, id, handle = None): - self.id = id + def __init__(self, user_id: str, handle: str): + if user_id is None: + raise ValueError('ID "None" is not allowed in UserData.') + self.user_id = user_id + if handle is None: + raise ValueError('handle "None" is not allowed in UserData.') self.handle = handle +class PathConfig: + """ + Helper class containing constants for various directories and files. 
+ + The script will only add / change / delete content in its own directories, which start with `parser-`. + Files within `parser-output` are the end result that the user is probably interested in. + Files within `parser-cache` are temporary working files, which improve the efficiency if you run + this script multiple times. They can safely be removed without harming the consistency of the + files within `parser-output`. + """ + def __init__(self, dir_archive): + self.dir_archive = dir_archive + self.dir_input_data = os.path.join(dir_archive, 'data') + self.file_account_js = os.path.join(self.dir_input_data, 'account.js') + + # check if user is in correct folder + if not os.path.isfile(self.file_account_js): + print( + f'Error: Failed to load {self.file_account_js}. ' + f'Start this script in the root folder of your Twitter archive.') + exit() + + self.dir_input_media = find_dir_input_media(self.dir_input_data) + self.dir_output = os.path.join(self.dir_archive, 'parser-output') + self.dir_output_media = os.path.join(self.dir_output, 'media') + self.dir_output_cache = os.path.join(self.dir_archive, 'parser-cache') + self.file_output_following = os.path.join(self.dir_output, 'following.txt') + self.file_output_followers = os.path.join(self.dir_output, 'followers.txt') + self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') + self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') + self.files_input_tweets = find_files_input_tweets(self.dir_input_data) + + # structured like an actual tweet output file, can be used to compute relative urls to a media file + self.example_file_output_tweets = self.create_path_for_file_output_tweets(year=2020, month=12) + + def create_path_for_file_output_tweets(self, year, month, format="html", kind="tweets") -> str: + """Builds the path for a tweet-archive file based on some properties.""" + # Previously the filename was f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' + return os.path.join(self.dir_output, f"{kind}-{format}", f"{year:04}", f"{year:04}-{month:02}-01-{kind}.{format}") + + def create_path_for_file_output_dms(self, name: str, index: Optional[int]=None, format: str="html", kind: str="DMs") -> str: + """Builds the path for a dm-archive file based on some properties.""" + index_suffix = "" + if (index): + index_suffix = f"-part{index:03}" + return os.path.join(self.dir_output, kind, f"{kind}-{name}{index_suffix}.{format}") + + def create_path_for_file_output_single(self, format: str, kind: str)->str: + """Builds the path for a single output file which, i.e. one that is not part of a larger group or sequence.""" + return os.path.join(self.dir_output, f"{kind}.{format}") + + +def format_duration(seconds: float) -> str: + duration_datetime: datetime.datetime = \ + datetime.datetime.fromtimestamp( + seconds, + tz=datetime.timezone.utc + ) + if duration_datetime.hour >= 1: + return f"{duration_datetime.hour } hour{ '' if duration_datetime.hour == 1 else 's'} " \ + f"{duration_datetime.minute} minute{'' if duration_datetime.minute == 1 else 's'}" + elif duration_datetime.minute >= 1: + return f"{duration_datetime.minute} minute{'' if duration_datetime.minute == 1 else 's'} " \ + f"{duration_datetime.second} second{'' if duration_datetime.second == 1 else 's'}" + else: + return f"{duration_datetime.second} second{'' if duration_datetime.second == 1 else 's'}" + + +def get_consent(prompt: str, default_to_yes: bool = False): + """Asks the user for consent, using the given prompt. 
Accepts various versions of yes/no, or + an empty answer to accept the default. The default is 'no' unless default_to_yes is passed as + True. The default will be indicated automatically. For unacceptable answers, the user will + be asked again.""" + if default_to_yes: + suffix = " [Y/n]" + default_answer = "yes" + else: + suffix = " [y/N]" + default_answer = "no" + while True: + user_input = input(prompt + suffix) + if user_input == "": + print (f"Your empty response was assumed to mean '{default_answer}' (the default for this question).") + return default_to_yes + if user_input.lower() in ('y', 'yes'): + return True + if user_input.lower() in ('n', 'no'): + return False + print (f"Sorry, did not understand. Please answer with y, n, yes, no, or press enter to accept " + f"the default (which is '{default_answer}' in this case, as indicated by the uppercase " + f"'{default_answer.upper()[0]}'.)") + + def import_module(module): """Imports a module specified by a string. Example: requests = import_module('requests')""" try: return importlib.import_module(module) except ImportError: print(f'\nError: This script uses the "{module}" module which is not installed.\n') - user_input = input('OK to install using pip? [y/n]') - if not user_input.lower() in ('y', 'yes'): + if not get_consent('OK to install using pip?'): exit() subprocess.run([sys.executable, '-m', 'pip', 'install', module], check=True) return importlib.import_module(module) +def open_and_mkdirs(path_file): + """Opens a file for writing. If the parent directory does not exist yet, it is created first.""" + mkdirs_for_file(path_file) + return open(path_file, 'w', encoding='utf-8') + + +def mkdirs_for_file(path_file): + """Creates the parent directory of the given file, if it does not exist yet.""" + path_dir = os.path.split(path_file)[0] + os.makedirs(path_dir, exist_ok=True) + + +def rel_url(media_path, document_path): + """Computes the relative URL needed to link from `document_path` to `media_path`. + Assumes that `document_path` points to a file (e.g. `.md` or `.html`), not a directory.""" + return os.path.relpath(media_path, os.path.split(document_path)[0]).replace("\\", "/") + + def get_twitter_api_guest_token(session, bearer_token): """Returns a Twitter API guest token for the current session.""" guest_token_response = session.post("https://api.twitter.com/1.1/guest/activate.json", @@ -71,6 +189,8 @@ def get_twitter_api_guest_token(session, bearer_token): return guest_token +# TODO if downloading fails within the for loop, we should be able to return the already +# fetched users, but also make it clear that it is incomplete. Maybe do it like in get_tweets. def get_twitter_users(session, bearer_token, guest_token, user_ids): """Asks Twitter for all metadata associated with user_ids.""" users = {} @@ -91,34 +211,48 @@ def get_twitter_users(session, bearer_token, guest_token, user_ids): users[user["id_str"]] = user return users + def get_tweets(session, bearer_token, guest_token, tweet_ids, include_user=True, include_alt_text=True): - """ Get the json metadata for a multiple tweets. - If include_user is False, you will only get a numerical id for the user.""" + """Get the json metadata for multiple tweets. + If include_user is False, you will only get a numerical id for the user. + Returns `tweets, remaining_tweet_ids` where `tweets`. If all goes well, `tweets` will contain all + tweets, and `remaining_tweet_ids` is empty. If something goes wrong, downloading is stopped + and only the tweets we got until then are returned. 
+ TODO In some cases, up to 100 tweets may be both in `tweets` and `remaining_tweet_ids`.""" tweets = {} remaining_tweet_ids = tweet_ids.copy() - while remaining_tweet_ids: - max_batch = 100 - tweet_id_batch = remaining_tweet_ids[:max_batch] - tweet_id_list = ",".join(tweet_id_batch) - print(f"Download {len(tweet_id_batch)} tweets of {len(remaining_tweet_ids)} remaining...") - query_url = f"https://api.twitter.com/1.1/statuses/lookup.json?id={tweet_id_list}&tweet_mode=extended" - if not include_user: - query_url += "&trim_user=1" - if include_alt_text: - query_url += "&include_ext_alt_text=1" - response = session.get(query_url, - headers={'authorization': f'Bearer {bearer_token}', 'x-guest-token': guest_token}) - if response.status_code == 429: - # Rate limit exceeded - get a new token - guest_token = get_twitter_api_guest_token(session, bearer_token) - continue - if not response.status_code == 200: - raise Exception(f'Failed to get tweets: {response}') - response_json = json.loads(response.content) - for tweet in response_json: - tweets[tweet["id_str"]] = tweet - remaining_tweet_ids = remaining_tweet_ids[max_batch:] - return tweets + try: + while remaining_tweet_ids: + max_batch = 100 + tweet_id_batch = remaining_tweet_ids[:max_batch] + tweet_id_list = ",".join(map(str,tweet_id_batch)) + print(f"Download {len(tweet_id_batch)} tweets of {len(remaining_tweet_ids)} remaining...") + query_url = f"https://api.twitter.com/1.1/statuses/lookup.json?id={tweet_id_list}&tweet_mode=extended" + if not include_user: + query_url += "&trim_user=1" + if include_alt_text: + query_url += "&include_ext_alt_text=1" + response = session.get(query_url, + headers={'authorization': f'Bearer {bearer_token}', 'x-guest-token': guest_token}, timeout=5) + if response.status_code == 429: + # Rate limit exceeded - get a new token + guest_token = get_twitter_api_guest_token(session, bearer_token) + continue + if not response.status_code == 200: + raise Exception(f'Failed to get tweets: {response}') + response_json = json.loads(response.content) + for tweet in response_json: + if "id_str" in tweet: + tweets[tweet["id_str"]] = tweet + else: + print (f"Tweet could not be returned because it has no id: {tweet}") + remaining_tweet_ids = remaining_tweet_ids[max_batch:] + except Exception as err: + traceback.print_exc() + print(f"Exception during batch download of tweets: {err}"); + print(f"Try to work with the tweets we got so far."); + return tweets, remaining_tweet_ids + def lookup_users(user_ids, users): """Fill the users dictionary with data from Twitter""" @@ -130,9 +264,9 @@ def lookup_users(user_ids, users): # Account metadata observed at ~2.1KB on average. estimated_size = int(2.1 * len(filtered_user_ids)) print(f'{len(filtered_user_ids)} users are unknown.') - user_input = input(f'Download user data from Twitter (approx {estimated_size:,}KB)? 
[y/n]') - if user_input.lower() not in ('y', 'yes'): + if not get_consent(f'Download user data from Twitter (approx {estimated_size:,} KB)?'): return + requests = import_module('requests') try: with requests.Session() as session: @@ -140,10 +274,13 @@ def lookup_users(user_ids, users): guest_token = get_twitter_api_guest_token(session, bearer_token) retrieved_users = get_twitter_users(session, bearer_token, guest_token, filtered_user_ids) for user_id, user in retrieved_users.items(): - users[user_id] = UserData(user_id, user["screen_name"]) + if user["screen_name"] is not None: + users[user_id] = UserData(user_id=user_id, handle=user["screen_name"]) + print() # empty line for better readability of output except Exception as err: print(f'Failed to download user data: {err}') + def read_json_from_js_file(filename): """Reads the contents of a Twitter-produced .js file into a dictionary.""" print(f'Parsing {filename}...') @@ -162,60 +299,215 @@ def read_json_from_js_file(filename): return json.loads(data) -def extract_username(paths): +def extract_username(paths: PathConfig): """Returns the user's Twitter username from account.js.""" account = read_json_from_js_file(paths.file_account_js) return account[0]['account']['username'] -def collect_tweet_id(tweet): - if 'tweet' in tweet.keys(): - tweet = tweet['tweet'] - return tweet['id_str'] + +def escape_markdown(input_text: str) -> str: + """ + Escape markdown control characters from input text so that the text will not break in rendered markdown. + (Only use on unformatted text parts that do not yet have any markdown control characters added on purpose!) + """ + characters_to_escape: str = r"\_*[]()~`>#+-=|{}.!" + output_text: str = '' + for char in input_text: + if char in characters_to_escape: + # add backslash before control char + output_text = output_text + "\\" + char + elif char == '\n': + # add double space before line break + output_text = output_text + " " + char + else: + output_text = output_text + char + return output_text -def collect_tweet_references(tweet, known_tweet_ids, counts): +def parse_as_number(str_or_number): + """Returns an int if you give it either an int or a str that can be parsed as an int. Otherwise, returns None.""" + if isinstance(str_or_number, str): + if str_or_number.isnumeric(): + return int(str_or_number) + else: + return None + elif isinstance(str_or_number, int): + return str_or_number + else: + return None + + +def equal_ignore_types(a, b): + """Recognizes two things as equal even if one is a str and the other is a number (but with identical content), or if both are lists or both are dicts, and all of their nested values are equal_ignore_types""" + if a == b: + return True + if parse_as_number(a) is not None and parse_as_number(b) is not None: + return parse_as_number(a) == parse_as_number(b) + if isinstance(a, dict) and isinstance (b, dict): + if len(a) != len(b): + return False + for key in a.keys(): + if not equal_ignore_types(a[key], b[key]): + return False + return True + if isinstance(a, list) and isinstance(b, list): + if len(a) != len(b): + return False + for i in range(len(a)): + if not equal_ignore_types(a[i], b[i]): + return False + return True + return False + + +def merge_lists(a: list, b: list, ignore_types:bool=False): + """Adds all items from b to a which are not already in a. 
If you pass ignore_types=True, it uses equal_ignore_types internally, and also recognizes two list items as equal if they both are dicts with equal id_str values in it, which results in merging the dicts instead of adding both separately to the result. Modifies a and returns a.""" + for item_b in b: + found_in_a = False + if ignore_types: + for item_a in a: + if equal_ignore_types(item_a, item_b): + found_in_a = True + break + if isinstance(item_a, dict) and isinstance(item_b, dict) and has_path(item_a, ['id_str']) and has_path(item_b, ['id_str']) and item_a['id_str'] == item_b['id_str']: + merge_dicts(item_a, item_b) + else: + found_in_a = item_b in a + + if not found_in_a: + a.append(item_b) + return a + + +# Taken from https://stackoverflow.com/a/7205107/39946, then adapted to +# some commonly observed twitter specifics. +def merge_dicts(a, b, path=None): + "merges b into a" + if path is None: path = [] + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key], path + [str(key)]) + elif isinstance(a[key], list) and isinstance(b[key], list): + merge_lists(a[key], b[key], ignore_types=True) + elif a[key] == b[key]: + pass # same leaf value + elif key == 'retweet_count' or key == 'favorite_count': + a[key] = max(parse_as_number(a[key]), parse_as_number(b[key])) + elif key in ['possibly_sensitive']: + # ignore conflicts in unimportant fields that tend to differ + pass + elif parse_as_number(a[key]) == parse_as_number(b[key]): + # Twitter sometimes puts numbers into strings, so that the same number might be 3 or '3' + a[key] = parse_as_number(a[key]) + elif a[key] is None and b[key] is not None: + # just as if `not key in a` + a[key] = b[key] + elif a[key] is not None and b[key] is None: + # Nothing to update + pass + else: + raise Exception(f"Conflict at {'.'.join(path + [str(key)])}, value '{a[key]}' vs. 
'{b[key]}'") + else: + a[key] = b[key] + return a + + +def unwrap_tweet(tweet): if 'tweet' in tweet.keys(): - tweet = tweet['tweet'] + return tweet['tweet'] + else: + return tweet + + +def add_known_tweet(known_tweets, new_tweet): + tweet_id = new_tweet['id_str'] + if tweet_id in known_tweets: + if known_tweets[tweet_id] == new_tweet: + pass + #print(f"Tweet {tweet_id} was already known with identical contents") + else: + try: + merge_dicts(known_tweets[tweet_id], new_tweet) + except Exception as err: + print(traceback.format_exc()) + print(f"Tweet {tweet_id} could not be merged: {err}") + + else: + #print(f"Tweet {tweet_id} is new") + known_tweets[tweet_id] = new_tweet + + +def collect_tweet_references(tweet, known_tweets, counts): + tweet = unwrap_tweet(tweet) tweet_ids = set() + + # Don't search for tweet references if this tweet was not part of the original archive + if 'from_archive' not in tweet: + return tweet_ids + # Collect quoted tweets - if 'entities' in tweet and 'urls' in tweet['entities']: + if has_path(tweet, ['entities', 'urls']): for url in tweet['entities']['urls']: if 'url' in url and 'expanded_url' in url: expanded_url = url['expanded_url'] matches = re.match(r'^https://twitter.com/([0-9A-Za-z_]*)/status/(\d+)$', expanded_url) if (matches): #user_handle = matches[1] - tweet_ids.add(matches[2]) - counts['quote'] += 1 + quoted_id = matches[2] + if (quoted_id in known_tweets): + counts['known_quote'] += 1 + else: + tweet_ids.add(quoted_id) + counts['quote'] += 1 # Collect previous tweets in conversation - if 'in_reply_to_status_id_str' in tweet: - if (tweet['in_reply_to_status_id_str'] in known_tweet_ids): + # Only do this for tweets from our original archive + if 'from_archive' in tweet and has_path(tweet, ['in_reply_to_status_id_str']): + prev_tweet_id = tweet['in_reply_to_status_id_str'] + if (prev_tweet_id in known_tweets): counts['known_reply'] += 1 else: - tweet_ids.add(tweet['in_reply_to_status_id_str']) + tweet_ids.add(prev_tweet_id) counts['reply'] += 1 # Collect retweets - if 'full_text' in tweet and tweet['full_text'].startswith('RT @'): + # Don't do this if we already re-downloaded this tweet + if not 'from_api' in tweet and 'full_text' in tweet and tweet['full_text'].startswith('RT @'): tweet_ids.add(tweet['id_str']) counts['retweet'] += 1 # Collect tweets with media, which might lack alt text # TODO we might filter for media which has "type" : "photo" because there is no alt text for videos - if 'entities' in tweet and 'media' in tweet['entities']: + # Don't do this if we already re-downloaded this tweet with alt texts enabled + if not 'download_with_alt_text' in tweet and has_path(tweet, ['entities', 'media']): tweet_ids.add(tweet['id_str']) counts['media'] += 1 + if None in tweet_ids: + raise Exception(f"Tweet has id None: {tweet}") + return tweet_ids -def convert_tweet(tweet, username, media_sources, users, referenced_tweets, paths): + +def has_path(dict, index_path: List[str]): + """Walks a path through nested dicts or lists, and returns True if all the keys are present, and all of the values are not None.""" + for index in index_path: + if not index in dict: + return False + dict = dict[index] + if dict is None: + return False + return True + + +def convert_tweet(tweet, username, media_sources: dict, users, referenced_tweets, paths: PathConfig): """Converts a JSON-format tweet. 
Returns tuple of timestamp, markdown and HTML.""" # TODO actually use `referenced_tweets` - if 'tweet' in tweet.keys(): - tweet = tweet['tweet'] + tweet = unwrap_tweet(tweet) timestamp_str = tweet['created_at'] - timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019 + timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) + # Example: Tue Mar 19 14:05:17 +0000 2019 body_markdown = tweet['full_text'] body_html = tweet['full_text'] tweet_id_str = tweet['id_str'] @@ -223,30 +515,36 @@ def convert_tweet(tweet, username, media_sources, users, referenced_tweets, path # added to the urls entities list so that we can build correct links later on. if 'entities' in tweet and 'media' not in tweet['entities'] and len(tweet['entities'].get("urls", [])) == 0: for word in tweet['full_text'].split(): - url = urlparse(word) - if url.scheme != '' and url.netloc != '' and not word.endswith('\u2026'): - # Shorten links similiar to twitter - netloc_short = url.netloc[4:] if url.netloc.startswith("www.") else url.netloc - path_short = url.path if len(url.path + '?' + url.query) < 15 else (url.path + '?' + url.query)[:15] + '\u2026' - tweet['entities']['urls'].append({ - 'url': word, - 'expanded_url': word, - 'display_url': netloc_short + path_short, - 'indices': [tweet['full_text'].index(word), tweet['full_text'].index(word) + len(word)], - }) + try: + url = urlparse(word) + except ValueError: + pass # don't crash when trying to parse something that looks like a URL but actually isn't + else: + if url.scheme != '' and url.netloc != '' and not word.endswith('\u2026'): + # Shorten links similar to twitter + netloc_short = url.netloc[4:] if url.netloc.startswith("www.") else url.netloc + path_short = url.path if len(url.path + '?' + url.query) < 15 \ + else (url.path + '?' 
+ url.query)[:15] + '\u2026' + tweet['entities']['urls'].append({ + 'url': word, + 'expanded_url': word, + 'display_url': netloc_short + path_short, + 'indices': [tweet['full_text'].index(word), tweet['full_text'].index(word) + len(word)], + }) # replace t.co URLs with their original versions - if 'entities' in tweet and 'urls' in tweet['entities']: + if has_path(tweet, ['entities', 'urls']): for url in tweet['entities']['urls']: if 'url' in url and 'expanded_url' in url: expanded_url = url['expanded_url'] body_markdown = body_markdown.replace(url['url'], expanded_url) expanded_url_html = f'{expanded_url}' body_html = body_html.replace(url['url'], expanded_url_html) - # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to + # if the tweet is a reply, construct a header that links the names + # of the accounts being replied to the tweet being replied to header_markdown = '' header_html = '' if 'in_reply_to_status_id' in tweet: - # match and remove all occurences of '@username ' at the start of the body + # match and remove all occurrences of '@username ' at the start of the body replying_to = re.match(r'^(@[0-9A-Za-z_]* )*', body_markdown)[0] if replying_to: body_markdown = body_markdown[len(replying_to):] @@ -261,10 +559,14 @@ def convert_tweet(tweet, username, media_sources, users, referenced_tweets, path name_list = ', '.join(names[:-1]) + (f' and {names[-1]}' if len(names) > 1 else names[0]) in_reply_to_status_id = tweet['in_reply_to_status_id'] replying_to_url = f'https://twitter.com/{in_reply_to_screen_name}/status/{in_reply_to_status_id}' - header_markdown += f'Replying to [{name_list}]({replying_to_url})\n\n' + header_markdown += f'Replying to [{escape_markdown(name_list)}]({replying_to_url})\n\n' header_html += f'Replying to {name_list}
' + # escape tweet body for markdown rendering: + body_markdown = escape_markdown(body_markdown) # replace image URLs with image links to local files - if 'entities' in tweet and 'media' in tweet['entities'] and 'extended_entities' in tweet and 'media' in tweet['extended_entities']: + if has_path(tweet, ['entities', 'media']) and has_path(tweet, ['extended_entities', 'media']) \ + and len(tweet['entities']['media']) > 0 and 'url' in tweet['entities']['media'][0]: + original_url = tweet['entities']['media'][0]['url'] markdown = '' html = '' @@ -275,9 +577,10 @@ def convert_tweet(tweet, username, media_sources, users, referenced_tweets, path archive_media_filename = tweet_id_str + '-' + original_filename archive_media_path = os.path.join(paths.dir_input_media, archive_media_filename) file_output_media = os.path.join(paths.dir_output_media, archive_media_filename) - media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' - markdown += '' if not markdown and body_markdown == original_url else '\n\n' + media_url = rel_url(file_output_media, paths.example_file_output_tweets) + markdown += '' if not markdown and body_markdown == escape_markdown(original_url) else '\n\n' html += '' if not html and body_html == original_url else '
' + # if file exists, this means that file is probably an image (not a video) if os.path.isfile(archive_media_path): # Found a matching image, use this one if not os.path.isfile(file_output_media): @@ -286,20 +589,25 @@ def convert_tweet(tweet, username, media_sources, users, referenced_tweets, path html += f'' # Save the online location of the best-quality version of this file, for later upgrading if wanted best_quality_url = f'https://pbs.twimg.com/media/{original_filename}:orig' - media_sources.append((os.path.join(paths.dir_output_media, archive_media_filename), best_quality_url)) + media_sources[os.path.join(paths.dir_output_media, archive_media_filename)] = best_quality_url else: + # If the file does not exists, it might be a video. Then its filename might + # be found like this: # Is there any other file that includes the tweet_id in its filename? archive_media_paths = glob.glob(os.path.join(paths.dir_input_media, tweet_id_str + '*')) if len(archive_media_paths) > 0: for archive_media_path in archive_media_paths: archive_media_filename = os.path.split(archive_media_path)[-1] file_output_media = os.path.join(paths.dir_output_media, archive_media_filename) - media_url = f'{os.path.split(paths.dir_output_media)[1]}/{archive_media_filename}' + media_url = rel_url(file_output_media, paths.example_file_output_tweets) if not os.path.isfile(file_output_media): shutil.copy(archive_media_path, file_output_media) - markdown += f'\n' - html += f'\n' - # Save the online location of the best-quality version of this file, for later upgrading if wanted + markdown += f'\n' + html += f'\n' + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted if 'video_info' in media and 'variants' in media['video_info']: best_quality_url = '' best_bitrate = -1 # some valid videos are marked with bitrate=0 in the JSON @@ -310,41 +618,50 @@ def convert_tweet(tweet, username, media_sources, users, referenced_tweets, path best_quality_url = variant['url'] best_bitrate = bitrate if best_bitrate == -1: - print(f"Warning No URL found for {original_url} {original_expanded_url} {archive_media_path} {media_url}") + print(f"Warning No URL found for {original_url} {original_expanded_url} " + f"{archive_media_path} {media_url}") print(f"JSON: {tweet}") else: - media_sources.append((os.path.join(paths.dir_output_media, archive_media_filename), best_quality_url)) + media_sources[os.path.join(paths.dir_output_media, archive_media_filename)] = best_quality_url else: - print(f'Warning: missing local file: {archive_media_path}. Using original link instead: {original_url} (expands to {original_expanded_url})') + print(f'Warning: missing local file: {archive_media_path}. Using original link instead: ' + f'{original_url} (expands to {original_expanded_url})') markdown += f'![]({original_url})' html += f'{original_url}' - body_markdown = body_markdown.replace(original_url, markdown) + body_markdown = body_markdown.replace(escape_markdown(original_url), markdown) body_html = body_html.replace(original_url, html) # make the body a quote body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) body_html = '
<p><blockquote>' + '<br>\n'.join(body_html.splitlines()) + '</blockquote>
' # append the original Twitter URL as a link original_tweet_url = f'https://twitter.com/{username}/status/{tweet_id_str}' - body_markdown = header_markdown + body_markdown + f'\n\n [{timestamp_str}]({original_tweet_url})' - body_html = header_html + body_html + f' {timestamp_str}
</a></p>
' + icon_url = rel_url(paths.file_tweet_icon, paths.example_file_output_tweets) + body_markdown = header_markdown + body_markdown + f'\n\n<img src="{icon_url}" width="12" /> ' \ + f'[{timestamp_str}]({original_tweet_url})' + body_html = header_html + body_html + f'<a href="{original_tweet_url}"><img src="{icon_url}" width="12" />&nbsp;{timestamp_str}</a></p>
' # extract user_id:handle connections - if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet: - id = tweet['in_reply_to_user_id'] - if int(id) >= 0: # some ids are -1, not sure why + if 'in_reply_to_user_id' in tweet and 'in_reply_to_screen_name' in tweet and \ + tweet['in_reply_to_screen_name'] is not None: + reply_to_id = tweet['in_reply_to_user_id'] + if int(reply_to_id) >= 0: # some ids are -1, not sure why handle = tweet['in_reply_to_screen_name'] - users[id] = UserData(id=id, handle=handle) - if 'entities' in tweet and 'user_mentions' in tweet['entities']: + users[reply_to_id] = UserData(user_id=reply_to_id, handle=handle) + if 'entities' in tweet and 'user_mentions' in tweet['entities'] and tweet['entities']['user_mentions'] is not None: for mention in tweet['entities']['user_mentions']: - id = mention['id'] - if int(id) >= 0: # some ids are -1, not sure why - handle = mention['screen_name'] - users[id] = UserData(id=id, handle=handle) + if mention is not None and 'id' in mention and 'screen_name' in mention: + mentioned_id = mention['id'] + if int(mentioned_id) >= 0: # some ids are -1, not sure why + handle = mention['screen_name'] + if handle is not None: + users[mentioned_id] = UserData(user_id=mentioned_id, handle=handle) return timestamp, body_markdown, body_html def find_files_input_tweets(dir_path_input_data): - """Identify the tweet archive's file and folder names - they change slightly depending on the archive size it seems.""" + """Identify the tweet archive's file and folder names - + they change slightly depending on the archive size it seems.""" input_tweets_file_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js'] files_paths_input_tweets = [] for input_tweets_file_template in input_tweets_file_templates: @@ -387,11 +704,13 @@ def download_file_if_larger(url, filename, index, count, sleep_time): try: with requests.get(url, stream=True, timeout=2) as res: if not res.status_code == 200: - # Try to get content of response as `res.text`. For twitter.com, this will be empty in most (all?) cases. + # Try to get content of response as `res.text`. + # For twitter.com, this will be empty in most (all?) cases. # It is successfully tested with error responses from other domains. - raise Exception(f'Download failed with status "{res.status_code} {res.reason}". Response content: "{res.text}"') + raise Exception(f'Download failed with status "{res.status_code} {res.reason}". ' + f'Response content: "{res.text}"') byte_size_after = int(res.headers['content-length']) - if (byte_size_after != byte_size_before): + if byte_size_after != byte_size_before: # Proceed with the full download tmp_filename = filename+'.tmp' print(f'{pref}Downloading {url}... ', end='\r') @@ -403,30 +722,32 @@ def download_file_if_larger(url, filename, index, count, sleep_time): pixels_before, pixels_after = width_before * height_before, width_after * height_after pixels_percentage_increase = 100.0 * (pixels_after - pixels_before) / pixels_before - if (width_before == -1 and height_before == -1 and width_after == -1 and height_after == -1): + if width_before == -1 and height_before == -1 and width_after == -1 and height_after == -1: # could not check size of both versions, probably a video or unsupported image format os.replace(tmp_filename, filename) bytes_percentage_increase = 100.0 * (byte_size_after - byte_size_before) / byte_size_before logging.info(f'{pref}SUCCESS. New version is {bytes_percentage_increase:3.0f}% ' f'larger in bytes (pixel comparison not possible). 
{post}') return True, byte_size_after - elif (width_before == -1 or height_before == -1 or width_after == -1 or height_after == -1): + elif width_before == -1 or height_before == -1 or width_after == -1 or height_after == -1: # could not check size of one version, this should not happen (corrupted download?) logging.info(f'{pref}SKIPPED. Pixel size comparison inconclusive: ' f'{width_before}*{height_before}px vs. {width_after}*{height_after}px. {post}') return False, byte_size_after - elif (pixels_after >= pixels_before): + elif pixels_after >= pixels_before: os.replace(tmp_filename, filename) bytes_percentage_increase = 100.0 * (byte_size_after - byte_size_before) / byte_size_before - if (bytes_percentage_increase >= 0): + if bytes_percentage_increase >= 0: logging.info(f'{pref}SUCCESS. New version is {bytes_percentage_increase:3.0f}% larger in bytes ' - f'and {pixels_percentage_increase:3.0f}% larger in pixels. {post}') + f'and {pixels_percentage_increase:3.0f}% larger in pixels. {post}') else: - logging.info(f'{pref}SUCCESS. New version is actually {-bytes_percentage_increase:3.0f}% smaller in bytes ' - f'but {pixels_percentage_increase:3.0f}% larger in pixels. {post}') + logging.info(f'{pref}SUCCESS. New version is actually {-bytes_percentage_increase:3.0f}% ' + f'smaller in bytes but {pixels_percentage_increase:3.0f}% ' + f'larger in pixels. {post}') return True, byte_size_after else: - logging.info(f'{pref}SKIPPED. Online version has {-pixels_percentage_increase:3.0f}% smaller pixel size. {post}') + logging.info(f'{pref}SKIPPED. Online version has {-pixels_percentage_increase:3.0f}% ' + f'smaller pixel size. {post}') return True, byte_size_after else: logging.info(f'{pref}SKIPPED. Online version is same byte size, assuming same content. Not downloaded.') @@ -436,13 +757,14 @@ def download_file_if_larger(url, filename, index, count, sleep_time): return False, 0 -def download_larger_media(media_sources, paths): +def download_larger_media(media_sources: dict, paths: PathConfig): """Uses (filename, URL) tuples in media_sources to download files from remote storage. Aborts downloads if the remote file is the same size or smaller than the existing local version. Retries the failed downloads several times, with increasing pauses between each to avoid being blocked. 
""" # Log to file as well as the console logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s') + mkdirs_for_file(paths.file_download_log) logfile_handler = logging.FileHandler(filename=paths.file_download_log, mode='w') logfile_handler.setLevel(logging.INFO) logging.getLogger().addHandler(logfile_handler) @@ -454,22 +776,36 @@ def download_larger_media(media_sources, paths): while remaining_tries > 0: number_of_files = len(media_sources) success_count = 0 - retries = [] - for index, (local_media_path, media_url) in enumerate(media_sources): + retries = {} + for index, (local_media_path, media_url) in enumerate(media_sources.items()): success, bytes_downloaded = download_file_if_larger(media_url, local_media_path, index + 1, number_of_files, sleep_time) if success: success_count += 1 else: - retries.append((local_media_path, media_url)) + retries[local_media_path] = media_url total_bytes_downloaded += bytes_downloaded + + # show % done and estimated remaining time: + time_elapsed: float = time.time() - start_time + estimated_time_per_file: float = time_elapsed / (index + 1) + + time_remaining_string = format_duration(seconds = (number_of_files - (index + 1)) * estimated_time_per_file) + + if index + 1 == number_of_files: + print(' 100 % done.') + else: + print(f' {(100*(index+1)/number_of_files):.1f} % done, about {time_remaining_string} remaining...') + media_sources = retries remaining_tries -= 1 sleep_time += 2 - logging.info(f'\n{success_count} of {number_of_files} tested media files are known to be the best-quality available.\n') + logging.info(f'\n{success_count} of {number_of_files} tested media files ' + f'are known to be the best-quality available.\n') if len(retries) == 0: break if remaining_tries > 0: - print(f'----------------------\n\nRetrying the ones that failed, with a longer sleep. {remaining_tries} tries remaining.\n') + print(f'----------------------\n\nRetrying the ones that failed, with a longer sleep. ' + f'{remaining_tries} tries remaining.\n') end_time = time.time() logging.info(f'Total downloaded: {total_bytes_downloaded/2**20:.1f}MB = {total_bytes_downloaded/2**30:.2f}GB') @@ -477,89 +813,143 @@ def download_larger_media(media_sources, paths): print(f'Wrote log to {paths.file_download_log}') -def parse_tweets(username, users, html_template, paths): +def parse_tweets(username, users, html_template, paths: PathConfig) -> dict: """Read tweets from paths.files_input_tweets, write to *.md and *.html. Copy the media used to paths.dir_output_media. Collect user_id:user_handle mappings for later use, in 'users'. Returns the mapping from media filename to best-quality URL. - """ - tweets = [] - media_sources = [] + """ + converted_tweets = [] + media_sources = {} counts = defaultdict(int) - known_tweet_ids = set() - - # TODO Load tweets that we saved in an earlier run between pass 2 and 3 + known_tweets = {} + + # TODO If we run this tool multiple times, in `known_tweets` we will have our own tweets as + # well as related tweets by others. With each run, the tweet graph is expanded. We probably do + # not want this. To stop it, implement one of these: + # 1. keep own tweets and other tweets in different dicts + # 2. put them all in one dict, but mark the tweets by others, so that certain steps will ignore them + # 3. 
use the data that is already present in a tweet to distinguish own tweets from others + + # Load tweets that we saved in an earlier run between pass 2 and 3 + tweet_dict_filename = os.path.join(paths.dir_output_cache, 'known_tweets.json') + if os.path.exists(tweet_dict_filename): + with open(tweet_dict_filename, 'r', encoding='utf8') as f: + known_tweets = json.load(f) - # First pass: collect IDs of known tweets - for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - print (f"Processing {len(json)} tweets in {tweets_js_filename}...") - for tweet in json: - known_tweet_ids.add(collect_tweet_id(tweet)) - - # Second pass: collect IDs of references tweets, excluding known tweets from pass 1 + # Fist pass: Load tweets from all archive files and add them to known_tweets + for tweets_js_filename in paths.files_input_tweets: + json_result = read_json_from_js_file(tweets_js_filename) + for tweet in json_result: + tweet = unwrap_tweet(tweet) + tweet['from_archive'] = True + add_known_tweet(known_tweets, tweet) + tweet_ids_to_download = set() - for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - for tweet in json: - tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweet_ids, counts)) + + # Second pass: Iterate through all those tweets + for tweet in known_tweets.values(): + tweet_ids_to_download.update(collect_tweet_references(tweet, known_tweets, counts)) - # Download referenced tweets + # (Maybe) download referenced tweets referenced_tweets = [] - if (len(tweet_ids_to_download) > 0): + if (len(tweet_ids_to_download) >0): print(f"Found references to {len(tweet_ids_to_download)} tweets which should be downloaded. Breakdown of download reasons:") for reason in ['quote', 'reply', 'retweet', 'media']: print(f" * {counts[reason]} because of {reason}") print(f"There were {counts['known_reply']} references to tweets which are already known so we don't need to download them (not included in the numbers above).") - # TODO maybe ask the user if we should start downloading - # TODO maybe give an estimate of download size and/or time - # TODO maybe let the user choose which of the tweets to download, by selecting a subset of those reasons - requests = import_module('requests') - try: - with requests.Session() as session: - bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' - guest_token = get_twitter_api_guest_token(session, bearer_token) - referenced_tweets = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) - # TODO Save tweets to a file, merging with contents of existing file if present - # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then. - except Exception as err: - print(f'Failed to download tweets: {err}') + print() + print("Please note that the downloaded tweets will not be included in the generated output yet.") + print("Anyway, we recommend to download the tweets now, just in case Twitter (or its API which") + print("we use), won't be available forever. 
A future version of this script will be able to") + print("include the downloaded tweets into the output, even if Twitter should not be available then.") + print() + + while (len(tweet_ids_to_download) > 0): + estimated_download_time_seconds = math.ceil(len(tweet_ids_to_download) / 100) * 2 + estimated_download_time_str = format_duration(estimated_download_time_seconds) + if get_consent(f"OK to download {len(tweet_ids_to_download)} tweets from twitter? This would take about {estimated_download_time_str}."): + # TODO maybe give an estimate of download size and/or time + # TODO maybe let the user choose which of the tweets to download, by selecting a subset of those reasons + requests = import_module('requests') + try: + with requests.Session() as session: + bearer_token = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' + guest_token = get_twitter_api_guest_token(session, bearer_token) + # TODO We could download user data together with the tweets, because we will need it anyway. But we might download the data for each user multiple times then. + downloaded_tweets, tweet_ids_to_download = get_tweets(session, bearer_token, guest_token, list(tweet_ids_to_download), False) + + for downloaded_tweet in downloaded_tweets.values(): + downloaded_tweet = unwrap_tweet(downloaded_tweet) + downloaded_tweet['from_api'] = True + downloaded_tweet['download_with_user'] = False + downloaded_tweet['download_with_alt_text'] = True + add_known_tweet(known_tweets, downloaded_tweet) + with open(tweet_dict_filename, "w") as outfile: + json.dump(known_tweets, outfile, indent=2) + print(f"Saved {len(known_tweets)} tweets to '{tweet_dict_filename}'.") + + except Exception as err: + # this code is rather unlikely to be reached, since get_tweets has internal error handling. 
+ print(f'Failed to download tweets: {err}') + + if len(tweet_ids_to_download) > 0: + print("Not all tweets could be downloaded, but you can retry if you want.") + else: + # Don't ask again and again if the user said 'no' + break # Third pass: convert tweets, using the downloaded references from pass 2 - for tweets_js_filename in input_filenames: - json = read_json_from_js_file(tweets_js_filename) - for tweet in json: - tweets.append(convert_tweet(tweet, username, media_sources, users, referenced_tweets, paths)) - - tweets.sort(key=lambda tup: tup[0]) # oldest first + for tweet in known_tweets.values(): + try: + converted_tweets.append(convert_tweet(tweet, username, media_sources, users, referenced_tweets, paths)) + except Exception as err: + print(f"Could not convert tweet {tweet['id_str']} because: {err}") + converted_tweets.sort(key=lambda tup: tup[0]) # oldest first # Group tweets by month grouped_tweets = defaultdict(list) - for timestamp, md, html in tweets: + for timestamp, md, html in converted_tweets: # Use a (markdown) filename that can be imported into Jekyll: YYYY-MM-DD-your-title-here.md dt = datetime.datetime.fromtimestamp(timestamp) - filename = f'{dt.year}-{dt.month:02}-01-Tweet-Archive-{dt.year}-{dt.month:02}' # change to group by day or year or timestamp - grouped_tweets[filename].append((md, html)) + grouped_tweets[(dt.year, dt.month)].append((md, html)) - for filename, content in grouped_tweets.items(): + for (year, month), content in grouped_tweets.items(): # Write into *.md files - md_string = '\n\n----\n\n'.join(md for md, _ in content) - with open(f'{filename}.md', 'w', encoding='utf-8') as f: + md_string = '\n\n----\n\n'.join(md for md, _ in content) + md_path = paths.create_path_for_file_output_tweets(year, month, format="md") + with open_and_mkdirs(md_path) as f: f.write(md_string) # Write into *.html files html_string = '
\n'.join(html for _, html in content) - with open(f'{filename}.html', 'w', encoding='utf-8') as f: + html_path = paths.create_path_for_file_output_tweets(year, month, format="html") + with open_and_mkdirs(html_path) as f: f.write(html_template.format(html_string)) - print(f'Wrote {len(tweets)} tweets to *.md and *.html, with images and video embedded from {paths.dir_output_media}') + print(f'Wrote {len(converted_tweets)} tweets to *.md and *.html, ' + f'with images and video embedded from {paths.dir_output_media}') return media_sources -def parse_followings(users, URL_template_user_id, paths): +def collect_user_ids_from_followings(paths) -> list: + """ + Collect all user ids that appear in the followings archive data. + (For use in bulk online lookup from Twitter.) + """ + # read JSON file from archive + following_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'following.js')) + # collect all user ids in a list + following_ids = [] + for follow in following_json: + if 'following' in follow and 'accountId' in follow['following']: + following_ids.append(follow['following']['accountId']) + return following_ids + + +def parse_followings(users, user_id_url_template, paths: PathConfig): """Parse paths.dir_input_data/following.js, write to paths.file_output_following. - Query Twitter API for the missing user handles, if the user agrees. """ following = [] following_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'following.js')) @@ -567,19 +957,33 @@ def parse_followings(users, URL_template_user_id, paths): for follow in following_json: if 'following' in follow and 'accountId' in follow['following']: following_ids.append(follow['following']['accountId']) - lookup_users(following_ids, users) - for id in following_ids: - handle = users[id].handle if id in users else '~unknown~handle~' - following.append(handle + ' ' + URL_template_user_id.format(id)) + for following_id in following_ids: + handle = users[following_id].handle if following_id in users else '~unknown~handle~' + following.append(handle + ' ' + user_id_url_template.format(following_id)) following.sort() - with open(paths.file_output_following, 'w', encoding='utf8') as f: + following_output_path = paths.create_path_for_file_output_single(format="txt", kind="following") + with open_and_mkdirs(following_output_path) as f: f.write('\n'.join(following)) - print(f"Wrote {len(following)} accounts to {paths.file_output_following}") + print(f"Wrote {len(following)} accounts to {following_output_path}") + + +def collect_user_ids_from_followers(paths) -> list: + """ + Collect all user ids that appear in the followers archive data. + (For use in bulk online lookup from Twitter.) + """ + # read JSON file from archive + follower_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'follower.js')) + # collect all user ids in a list + follower_ids = [] + for follower in follower_json: + if 'follower' in follower and 'accountId' in follower['follower']: + follower_ids.append(follower['follower']['accountId']) + return follower_ids -def parse_followers(users, URL_template_user_id, paths): +def parse_followers(users, user_id_url_template, paths: PathConfig): """Parse paths.dir_input_data/followers.js, write to paths.file_output_followers. - Query Twitter API for the missing user handles, if the user agrees. 
""" followers = [] follower_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'follower.js')) @@ -587,14 +991,14 @@ def parse_followers(users, URL_template_user_id, paths): for follower in follower_json: if 'follower' in follower and 'accountId' in follower['follower']: follower_ids.append(follower['follower']['accountId']) - lookup_users(follower_ids, users) - for id in follower_ids: - handle = users[id].handle if id in users else '~unknown~handle~' - followers.append(handle + ' ' + URL_template_user_id.format(id)) + for follower_id in follower_ids: + handle = users[follower_id].handle if follower_id in users else '~unknown~handle~' + followers.append(handle + ' ' + user_id_url_template.format(follower_id)) followers.sort() - with open(paths.file_output_followers, 'w', encoding='utf8') as f: + followers_output_path = paths.create_path_for_file_output_single(format="txt", kind="followers") + with open_and_mkdirs(followers_output_path) as f: f.write('\n'.join(followers)) - print(f"Wrote {len(followers)} accounts to {paths.file_output_followers}") + print(f"Wrote {len(followers)} accounts to {followers_output_path}") def chunks(lst: list, n: int): @@ -603,21 +1007,30 @@ def chunks(lst: list, n: int): yield lst[i:i + n] -def parse_direct_messages(username, users, URL_template_user_id, paths): - """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. - Query Twitter API for the missing user handles, if the user agrees. +def collect_user_ids_from_direct_messages(paths) -> list: + """ + Collect all user ids that appear in the direct messages archive data. + (For use in bulk online lookup from Twitter.) """ - # Scan the DMs for missing user handles + # read JSON file from archive dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages.js')) - dm_user_ids = set() + # collect all user ids in a set + dms_user_ids = set() for conversation in dms_json: if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: dm_conversation = conversation['dmConversation'] conversation_id = dm_conversation['conversationId'] user1_id, user2_id = conversation_id.split('-') - dm_user_ids.add(user1_id) - dm_user_ids.add(user2_id) - lookup_users(list(dm_user_ids), users) + dms_user_ids.add(user1_id) + dms_user_ids.add(user2_id) + return list(dms_user_ids) + + +def parse_direct_messages(username, users, user_id_url_template, paths: PathConfig): + """Parse paths.dir_input_data/direct-messages.js, write to one markdown file per conversation. 
+ """ + # read JSON file + dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages.js')) # Parse the DMs and store the messages in a dict conversations_messages = defaultdict(list) @@ -636,22 +1049,87 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): to_id = message_create['recipientId'] body = message_create['text'] # replace t.co URLs with their original versions - if 'urls' in message_create: + if 'urls' in message_create and len(message_create['urls']) > 0: for url in message_create['urls']: if 'url' in url and 'expanded' in url: expanded_url = url['expanded'] body = body.replace(url['url'], expanded_url) + # escape message body for markdown rendering: + body_markdown = escape_markdown(body) + # replace image URLs with image links to local files + if 'mediaUrls' in message_create \ + and len(message_create['mediaUrls']) == 1 \ + and 'urls' in message_create: + original_expanded_url = message_create['urls'][0]['expanded'] + message_id = message_create['id'] + media_hash_and_type = message_create['mediaUrls'][0].split('/')[-1] + media_id = message_create['mediaUrls'][0].split('/')[-2] + archive_media_filename = f'{message_id}-{media_hash_and_type}' + new_url = os.path.join(paths.dir_output_media, archive_media_filename) + archive_media_path = \ + os.path.join(paths.dir_input_data, 'direct_messages_media', archive_media_filename) + if os.path.isfile(archive_media_path): + # found a matching image, use this one + if not os.path.isfile(new_url): + shutil.copy(archive_media_path, new_url) + image_markdown = f'\n![]({new_url})\n' + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), image_markdown + ) + + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted + best_quality_url = \ + f'https://ton.twitter.com/i//ton/data/dm/' \ + f'{message_id}/{media_id}/{media_hash_and_type}' + # there is no ':orig' here, the url without any suffix has the original size + + # TODO: a cookie (and a 'Referer: https://twitter.com' header) + # is needed to retrieve it, so the url might be useless anyway... + + # WARNING: Do not uncomment the statement below until the cookie problem is solved! + # media_sources.append( + # ( + # os.path.join(output_media_folder_name, archive_media_filename), + # best_quality_url + # ) + # ) + + else: + archive_media_paths = glob.glob( + os.path.join(paths.dir_input_data, 'direct_messages_media', message_id + '*')) + if len(archive_media_paths) > 0: + for archive_media_path in archive_media_paths: + archive_media_filename = os.path.split(archive_media_path)[-1] + media_url = os.path.join(paths.dir_output_media, archive_media_filename) + if not os.path.isfile(media_url): + shutil.copy(archive_media_path, media_url) + video_markdown = f'\n\n' + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), video_markdown + ) + + # TODO: maybe also save the online location of the best-quality version for videos? + # (see above) + + else: + print(f'Warning: missing local file: {archive_media_path}. 
' + f'Using original link instead: {original_expanded_url})') + created_at = message_create['createdAt'] # example: 2022-01-27T15:58:52.744Z timestamp = \ int(round(datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp())) - from_handle = users[from_id].handle.replace('_', '\\_') if from_id in users \ - else URL_template_user_id.format(from_id) - to_handle = users[to_id].handle.replace('_', '\\_') if to_id in users \ - else URL_template_user_id.format(to_id) + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ + else user_id_url_template.format(from_id) + to_handle = escape_markdown(users[to_id].handle) if to_id in users \ + else user_id_url_template.format(to_id) - message_markdown = f'\n\n### {from_handle} -> {to_handle}: ' \ - f'({created_at}) ###\n```\n{body}\n```' + # make the body a quote + body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) + message_markdown = f'{from_handle} -> {to_handle}: ({created_at}) \n\n' \ + f'{body_markdown}' messages.append((timestamp, message_markdown)) # find identifier for the conversation @@ -667,12 +1145,12 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): # sort messages by timestamp messages.sort(key=lambda tup: tup[0]) - other_user_name = users[other_user_id].handle.replace('_', '\\_') if other_user_id in users \ - else URL_template_user_id.format(other_user_id) + other_user_name = escape_markdown(users[other_user_id].handle) if other_user_id in users \ + else user_id_url_template.format(other_user_id) other_user_short_name: str = users[other_user_id].handle if other_user_id in users else other_user_id - escaped_username = username.replace('_', '\\_') + escaped_username = escape_markdown(username) # if there are more than 1000 messages, the conversation was split up in the twitter archive. 
# following this standard, also split up longer conversations in the output files: @@ -680,27 +1158,26 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): if len(messages) > 1000: for chunk_index, chunk in enumerate(chunks(messages, 1000)): markdown = '' - markdown += f'## Conversation between {escaped_username} and {other_user_name}, ' \ - f'part {chunk_index+1}: ##\n' - markdown += ''.join(md for _, md in chunk) - conversation_output_filename = \ - paths.file_template_dm_output.format(f'{other_user_short_name}_part{chunk_index+1:03}') + markdown += f'### Conversation between {escaped_username} and {other_user_name}, ' \ + f'part {chunk_index+1}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in chunk) + conversation_output_path = paths.create_path_for_file_output_dms(name=other_user_short_name, index=(chunk_index + 1), format="md") # write part to a markdown file - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_path) as f: f.write(markdown) - print(f'Wrote {len(chunk)} messages to {conversation_output_filename}') + print(f'Wrote {len(chunk)} messages to {conversation_output_path}') num_written_files += 1 else: markdown = '' - markdown += f'## Conversation between {escaped_username} and {other_user_name}: ##\n' - markdown += ''.join(md for _, md in messages) - conversation_output_filename = paths.file_template_dm_output.format(other_user_short_name) + markdown += f'### Conversation between {escaped_username} and {other_user_name}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in messages) + conversation_output_path = paths.create_path_for_file_output_dms(name=other_user_short_name, format="md") - with open(conversation_output_filename, 'w', encoding='utf8') as f: + with open_and_mkdirs(conversation_output_path) as f: f.write(markdown) - print(f'Wrote {len(messages)} messages to {conversation_output_filename}') + print(f'Wrote {len(messages)} messages to {conversation_output_path}') num_written_files += 1 num_written_messages += len(messages) @@ -709,32 +1186,420 @@ def parse_direct_messages(username, users, URL_template_user_id, paths): f"({num_written_messages} total messages) to {num_written_files} markdown files\n") -class PathConfig: - """Helper class containing constants for various directories and files.""" - def __init__(self, dir_archive, dir_output): - self.dir_input_data = os.path.join(dir_archive, 'data') - self.dir_input_media = find_dir_input_media(self.dir_input_data) - self.dir_output_media = os.path.join(dir_output, 'media') - self.file_output_following = os.path.join(dir_output, 'following.txt') - self.file_output_followers = os.path.join(dir_output, 'followers.txt') - self.file_template_dm_output = os.path.join(dir_output, 'DMs-Archive-{}.md') - self.file_account_js = os.path.join(self.dir_input_data, 'account.js') - self.file_download_log = os.path.join(self.dir_output_media, 'download_log.txt') - self.file_tweet_icon = os.path.join(self.dir_output_media, 'tweet.ico') - self.files_input_tweets = find_files_input_tweets(self.dir_input_data) +def make_conversation_name_safe_for_filename(conversation_name: str) -> str: + """ + Remove/replace characters that could be unsafe in filenames + """ + forbidden_chars = \ + ['"', "'", '*', '/', '\\', ':', '<', '>', '?', '|', '!', '@', ';', ',', '=', '.', '\n', '\r', '\t'] + new_conversation_name = '' + for char in conversation_name: + if char in forbidden_chars: + new_conversation_name = new_conversation_name + 
'_' + elif char.isspace(): + # replace spaces with underscores + new_conversation_name = new_conversation_name + '_' + elif char == 0x7F or (0x1F >= ord(char) >= 0x00): + # 0x00 - 0x1F and 0x7F are also forbidden, just discard them + continue + else: + new_conversation_name = new_conversation_name + char + + return new_conversation_name + + +def find_group_dm_conversation_participant_ids(conversation: dict) -> set: + """ + Find IDs of all participating Users in a group direct message conversation + """ + group_user_ids = set() + if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: + dm_conversation = conversation['dmConversation'] + if 'messages' in dm_conversation: + for message in dm_conversation['messages']: + if 'messageCreate' in message: + group_user_ids.add(message['messageCreate']['senderId']) + elif 'joinConversation' in message: + group_user_ids.add(message['joinConversation']['initiatingUserId']) + for participant_id in message['joinConversation']['participantsSnapshot']: + group_user_ids.add(participant_id) + elif "participantsJoin" in message: + group_user_ids.add(message['participantsJoin']['initiatingUserId']) + for participant_id in message['participantsJoin']['userIds']: + group_user_ids.add(participant_id) + return group_user_ids + + +def collect_user_ids_from_group_direct_messages(paths) -> list: + """ + Collect all user ids that appear in the group direct messages archive data. + (For use in bulk online lookup from Twitter.) + """ + # read JSON file from archive + group_dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages-group.js')) + # collect all user ids in a set + group_dms_user_ids = set() + for conversation in group_dms_json: + participants = find_group_dm_conversation_participant_ids(conversation) + for participant_id in participants: + group_dms_user_ids.add(participant_id) + return list(group_dms_user_ids) + + +def parse_group_direct_messages(username, users, user_id_url_template, paths): + """Parse data_folder/direct-messages-group.js, write to one markdown file per conversation. 
+ """ + # read JSON file from archive + group_dms_json = read_json_from_js_file(os.path.join(paths.dir_input_data, 'direct-messages-group.js')) + + # Parse the group DMs, store messages and metadata in a dict + group_conversations_messages = defaultdict(list) + group_conversations_metadata = defaultdict(dict) + for conversation in group_dms_json: + if 'dmConversation' in conversation and 'conversationId' in conversation['dmConversation']: + dm_conversation = conversation['dmConversation'] + conversation_id = dm_conversation['conversationId'] + participants = find_group_dm_conversation_participant_ids(conversation) + participant_names = [] + for participant_id in participants: + if participant_id in users: + participant_names.append(users[participant_id].handle) + else: + participant_names.append(user_id_url_template.format(participant_id)) + + # save names in metadata + group_conversations_metadata[conversation_id]['participants'] = participants + group_conversations_metadata[conversation_id]['participant_names'] = participant_names + group_conversations_metadata[conversation_id]['conversation_names'] = [(0, conversation_id)] + group_conversations_metadata[conversation_id]['participant_message_count'] = defaultdict(int) + for participant_id in participants: + # init every participant's message count with 0, so that users with no activity are not ignored + group_conversations_metadata[conversation_id]['participant_message_count'][participant_id] = 0 + messages = [] + if 'messages' in dm_conversation: + for message in dm_conversation['messages']: + if 'messageCreate' in message: + message_create = message['messageCreate'] + if all(tag in message_create for tag in ['senderId', 'text', 'createdAt']): + from_id = message_create['senderId'] + # count how many messages this user has sent to the group + group_conversations_metadata[conversation_id]['participant_message_count'][from_id] += 1 + body = message_create['text'] + # replace t.co URLs with their original versions + if 'urls' in message_create: + for url in message_create['urls']: + if 'url' in url and 'expanded' in url: + expanded_url = url['expanded'] + body = body.replace(url['url'], expanded_url) + # escape message body for markdown rendering: + body_markdown = escape_markdown(body) + # replace image URLs with image links to local files + if 'mediaUrls' in message_create \ + and len(message_create['mediaUrls']) == 1 \ + and 'urls' in message_create: + original_expanded_url = message_create['urls'][0]['expanded'] + message_id = message_create['id'] + media_hash_and_type = message_create['mediaUrls'][0].split('/')[-1] + media_id = message_create['mediaUrls'][0].split('/')[-2] + archive_media_filename = f'{message_id}-{media_hash_and_type}' + new_url = os.path.join(paths.dir_output_media, archive_media_filename) + archive_media_path = \ + os.path.join(paths.dir_input_data, 'direct_messages_group_media', + archive_media_filename) + if os.path.isfile(archive_media_path): + # found a matching image, use this one + if not os.path.isfile(new_url): + shutil.copy(archive_media_path, new_url) + image_markdown = f'\n![]({new_url})\n' + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), image_markdown + ) + + # Save the online location of the best-quality version of this file, + # for later upgrading if wanted + best_quality_url = \ + f'https://ton.twitter.com/i//ton/data/dm/' \ + f'{message_id}/{media_id}/{media_hash_and_type}' + # there is no ':orig' here, the url without any suffix has the original size + + # TODO: 
a cookie (and a 'Referer: https://twitter.com' header) + # is needed to retrieve it, so the url might be useless anyway... + + # WARNING: Do not uncomment the statement below until the cookie problem is solved! + # media_sources.append( + # ( + # os.path.join(output_media_folder_name, archive_media_filename), + # best_quality_url + # ) + # ) + + else: + archive_media_paths = glob.glob( + os.path.join(paths.dir_input_data, 'direct_messages_group_media', + message_id + '*')) + if len(archive_media_paths) > 0: + for archive_media_path in archive_media_paths: + archive_media_filename = os.path.split(archive_media_path)[-1] + media_url = os.path.join(paths.dir_output_media, + archive_media_filename) + if not os.path.isfile(media_url): + shutil.copy(archive_media_path, media_url) + video_markdown = f'\n\n' + body_markdown = body_markdown.replace( + escape_markdown(original_expanded_url), video_markdown + ) + + # TODO: maybe also save the online location of the best-quality version for videos? + # (see above) + + else: + print(f'Warning: missing local file: {archive_media_path}. ' + f'Using original link instead: {original_expanded_url})') + created_at = message_create['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ + else user_id_url_template.format(from_id) + # make the body a quote + body_markdown = '> ' + '\n> '.join(body_markdown.splitlines()) + message_markdown = f'{from_handle}: ({created_at})\n\n' \ + f'{body_markdown}' + messages.append((timestamp, message_markdown)) + elif "conversationNameUpdate" in message: + conversation_name_update = message['conversationNameUpdate'] + if all(tag in conversation_name_update for tag in ['initiatingUserId', 'name', 'createdAt']): + from_id = conversation_name_update['initiatingUserId'] + body_markdown = f"_changed group name to: {escape_markdown(conversation_name_update['name'])}_" + created_at = conversation_name_update['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ + else user_id_url_template.format(from_id) + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' + messages.append((timestamp, message_markdown)) + # save metadata about name change: + group_conversations_metadata[conversation_id]['conversation_names'].append( + (timestamp, conversation_name_update['name']) + ) + elif "joinConversation" in message: + join_conversation = message['joinConversation'] + if all(tag in join_conversation for tag in ['initiatingUserId', 'createdAt']): + from_id = join_conversation['initiatingUserId'] + created_at = join_conversation['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ + else user_id_url_template.format(from_id) + escaped_username = escape_markdown(username) + body_markdown = f'_{from_handle} added {escaped_username} to the group_' + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' + messages.append((timestamp, message_markdown)) + elif "participantsJoin" in message: + participants_join = message['participantsJoin'] + if all(tag in participants_join for tag in ['initiatingUserId', 
'userIds', 'createdAt']): + from_id = participants_join['initiatingUserId'] + created_at = participants_join['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + from_handle = escape_markdown(users[from_id].handle) if from_id in users \ + else user_id_url_template.format(from_id) + joined_ids = participants_join['userIds'] + joined_handles = [escape_markdown(users[joined_id].handle) if joined_id in users + else user_id_url_template.format(joined_id) for joined_id in joined_ids] + name_list = ', '.join(joined_handles[:-1]) + \ + (f' and {joined_handles[-1]}' if len(joined_handles) > 1 else + joined_handles[0]) + body_markdown = f'_{from_handle} added {name_list} to the group_' + message_markdown = f'{from_handle}: ({created_at})\n\n{body_markdown}' + messages.append((timestamp, message_markdown)) + elif "participantsLeave" in message: + participants_leave = message['participantsLeave'] + if all(tag in participants_leave for tag in ['userIds', 'createdAt']): + created_at = participants_leave['createdAt'] # example: 2022-01-27T15:58:52.744Z + timestamp = int(round( + datetime.datetime.strptime(created_at, '%Y-%m-%dT%X.%fZ').timestamp() + )) + left_ids = participants_leave['userIds'] + left_handles = [escape_markdown(users[left_id].handle) if left_id in users + else user_id_url_template.format(left_id) for left_id in left_ids] + name_list = ', '.join(left_handles[:-1]) + \ + (f' and {left_handles[-1]}' if len(left_handles) > 1 else + left_handles[0]) + body_markdown = f'_{name_list} left the group_' + message_markdown = f'{name_list}: ({created_at})\n\n{body_markdown}' + messages.append((timestamp, message_markdown)) + + # collect messages per conversation in group_conversations_messages dict + group_conversations_messages[conversation_id].extend(messages) + + # output as one file per conversation (or part of long conversation) + num_written_messages = 0 + num_written_files = 0 + for conversation_id, messages in group_conversations_messages.items(): + # sort messages by timestamp + messages.sort(key=lambda tup: tup[0]) + # create conversation name for use in filename: + # first, try to find an official name in the parsed conversation data + + # Not-so-fun fact: + # If the name was set before the archive's owner joined the group, the name is not included + # in the archive data and can't be found anywhere (except by looking it up from twitter, + # and that would probably need a cookie). So there are many groups that do actually have a name, + # but it can't be used here because we don't know it. 
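The joined/left notices above assemble a readable list of handles with a join-plus-"and" expression. A tiny standalone version of that pattern, with made-up handles (the real code applies it to escape_markdown'ed handles or profile URLs):

```python
def join_handles(handles: list) -> str:
    """Join handles as 'a, b and c'; assumes a non-empty list, as in the parser."""
    if len(handles) == 1:
        return handles[0]
    return ', '.join(handles[:-1]) + f' and {handles[-1]}'

# made-up handles, purely for illustration
assert join_handles(['@alice']) == '@alice'
assert join_handles(['@alice', '@bob']) == '@alice and @bob'
assert join_handles(['@alice', '@bob', '@carol']) == '@alice, @bob and @carol'
```

The same pattern is reused further down to build the "@name1, @name2 and @name3" headline written at the top of each group-conversation file.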
+ + group_conversations_metadata[conversation_id]['conversation_names'].sort(key=lambda tup: tup[0], reverse=True) + official_name = group_conversations_metadata[conversation_id]['conversation_names'][0][1] + safe_group_name = make_conversation_name_safe_for_filename(official_name) + if len(safe_group_name) < 2: + # discard name if it's too short (because of collision risk) + group_name = conversation_id + else: + group_name = safe_group_name + + if group_name == conversation_id: + # try to make a nice list of participant handles for the conversation name + handles = [] + for participant_id, message_count in \ + group_conversations_metadata[conversation_id]['participant_message_count'].items(): + if participant_id in users: + participant_handle = users[participant_id].handle + if participant_handle != username: + handles.append((participant_handle, message_count)) + # sort alphabetically by handle first, for a more deterministic order + handles.sort(key=lambda tup: tup[0]) + # sort so that the most active users are at the start of the list + handles.sort(key=lambda tup: tup[1], reverse=True) + if len(handles) == 1: + group_name = \ + f'{handles[0][0]}_and_{len(group_conversations_metadata[conversation_id]["participants"]) - 1}_more' + elif len(handles) == 2 and len(group_conversations_metadata[conversation_id]["participants"]) == 3: + group_name = f'{handles[0][0]}_and_{handles[1][0]}_and_{username}' + elif len(handles) >= 2: + group_name = \ + f'{handles[0][0]}_and_{handles[1][0]}_and' \ + f'_{len(group_conversations_metadata[conversation_id]["participants"]) - 2}_more' + else: + # just use the conversation id + group_name = conversation_id + + # create a list of names of the form '@name1, @name2 and @name3' + # to use as a headline in the output file + escaped_participant_names = [ + escape_markdown(participant_name) + for participant_name in group_conversations_metadata[conversation_id]['participant_names'] + ] + name_list = ', '.join(escaped_participant_names[:-1]) + \ + (f' and {escaped_participant_names[-1]}' + if len(escaped_participant_names) > 1 + else escaped_participant_names[0]) + + if len(messages) > 1000: + for chunk_index, chunk in enumerate(chunks(messages, 1000)): + markdown = '' + markdown += f'## {official_name} ##\n\n' + markdown += f'### Group conversation between {name_list}, part {chunk_index + 1}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in chunk) + conversation_output_filename = paths.create_path_for_file_output_dms( + name=group_name, format="md", kind="DMs-Group", index=chunk_index + 1 + ) + + # write part to a markdown file + with open_and_mkdirs(conversation_output_filename) as f: + f.write(markdown) + print(f'Wrote {len(chunk)} messages to {conversation_output_filename}') + num_written_files += 1 + else: + markdown = '' + markdown += f'## {official_name} ##\n\n' + markdown += f'### Group conversation between {name_list}: ###\n\n----\n\n' + markdown += '\n\n----\n\n'.join(md for _, md in messages) + conversation_output_filename = \ + paths.create_path_for_file_output_dms(name=group_name, format="md", kind="DMs-Group") + + with open_and_mkdirs(conversation_output_filename) as f: + f.write(markdown) + print(f'Wrote {len(messages)} messages to {conversation_output_filename}') + num_written_files += 1 + + num_written_messages += len(messages) + + print(f"\nWrote {len(group_conversations_messages)} direct message group conversations " + f"({num_written_messages} total messages) to {num_written_files} markdown files") + + +def 
migrate_old_output(paths: PathConfig):
+    """If present, moves media and cache files from the archive root to the new locations in
+    `paths.dir_output_media` and `paths.dir_output_cache`. Then deletes old output files
+    (md, html, txt) from the archive root, if the user consents."""
+
+    # Create new folders, so we can potentially use them to move files there
+    os.makedirs(paths.dir_output_media, exist_ok=True)
+    os.makedirs(paths.dir_output_cache, exist_ok=True)
+
+    # Move files that we can re-use:
+    if os.path.exists(os.path.join(paths.dir_archive, "media")):
+        files_to_move = glob.glob(os.path.join(paths.dir_archive, "media", "*"))
+        if len(files_to_move) > 0:
+            print(f"Moving {len(files_to_move)} files from 'media' to '{paths.dir_output_media}'")
+            for file_path_to_move in files_to_move:
+                file_name_to_move = os.path.split(file_path_to_move)[1]
+                os.rename(file_path_to_move, os.path.join(paths.dir_output_media, file_name_to_move))
+        os.rmdir(os.path.join(paths.dir_archive, "media"))
+
+    known_tweets_old_path = os.path.join(paths.dir_archive, "known_tweets.json")
+    known_tweets_new_path = os.path.join(paths.dir_output_cache, "known_tweets.json")
+    if os.path.exists(known_tweets_old_path):
+        os.rename(known_tweets_old_path, known_tweets_new_path)
+
+    # Delete files that would be overwritten anyway (if user consents):
+    output_globs = [
+        "TweetArchive.html",
+        "*Tweet-Archive*.html",
+        "*Tweet-Archive*.md",
+        "DMs-Archive-*.html",
+        "DMs-Archive-*.md",
+        "DMs-Group-Archive-*.html",
+        "DMs-Group-Archive-*.md",
+        "followers.txt",
+        "following.txt",
+    ]
+    files_to_delete = []
+
+    for output_glob in output_globs:
+        files_to_delete += glob.glob(os.path.join(paths.dir_archive, output_glob))
+
+    # TODO maybe remove those files only after the new ones have been generated? This way, the user would never
+    #  end up with less output than before. On the other hand, they might end up with old *and* new versions
+    #  of the output, if the script crashes before it reaches the code to delete the old version.
+    if len(files_to_delete) > 0:
+        print(f"\nThere are {len(files_to_delete)} files in the root of the archive,")
+        print("which were probably generated by an older version of this script.")
+        print("Since then, the directory layout of twitter-archive-parser has changed")
+        print("and these files are now generated into the sub-directory 'parser-output' or")
+        print("various sub-sub-directories therein. These are the affected files:\n")
+
+        for file_to_delete in files_to_delete:
+            print(file_to_delete)
+
+        print()
+        if get_consent('OK to delete these files? (If the directory layout had not changed, they would have been overwritten anyway.)'):
+            for file_to_delete in files_to_delete:
+                os.remove(file_to_delete)
+            print("Files have been deleted. New versions of these files will be generated into 'parser-output' soon.")


 def main():
-    paths = PathConfig(dir_archive='.', dir_output='.')
+    paths = PathConfig(dir_archive='.')

-    # Extract the username from data/account.js
-    if not os.path.isfile(paths.file_account_js):
-        print(f'Error: Failed to load {paths.file_account_js}. 
Start this script in the root folder of your Twitter archive.') - exit() + # Extract the archive owner's username from data/account.js username = extract_username(paths) - # URL config - URL_template_user_id = 'https://twitter.com/i/user/{}' + user_id_url_template = 'https://twitter.com/i/user/{}' html_template = """\ @@ -756,26 +1621,79 @@ def main(): users = {} + migrate_old_output(paths) + # Make a folder to copy the images and videos into. - os.makedirs(paths.dir_output_media, exist_ok = True) + os.makedirs(paths.dir_output_media, exist_ok=True) if not os.path.isfile(paths.file_tweet_icon): - shutil.copy('assets/images/favicon.ico', paths.file_tweet_icon); + shutil.copy('assets/images/favicon.ico', paths.file_tweet_icon) media_sources = parse_tweets(username, users, html_template, paths) - parse_followings(users, URL_template_user_id, paths) - parse_followers(users, URL_template_user_id, paths) - parse_direct_messages(username, users, URL_template_user_id, paths) + + following_ids = collect_user_ids_from_followings(paths) + print(f'found {len(following_ids)} user IDs in followings.') + follower_ids = collect_user_ids_from_followers(paths) + print(f'found {len(follower_ids)} user IDs in followers.') + dms_user_ids = collect_user_ids_from_direct_messages(paths) + print(f'found {len(dms_user_ids)} user IDs in direct messages.') + group_dms_user_ids = collect_user_ids_from_group_direct_messages(paths) + print(f'found {len(group_dms_user_ids)} user IDs in group direct messages.') + + # bulk lookup for user handles from followers, followings, direct messages and group direct messages + collected_user_ids_without_followers = list( + set(following_ids).union(set(dms_user_ids)).union(set(group_dms_user_ids)) + ) + collected_user_ids_only_in_followers: set = set(follower_ids).difference(set(collected_user_ids_without_followers)) + collected_user_ids: list = list(set(collected_user_ids_without_followers) + .union(collected_user_ids_only_in_followers)) + + print(f'\nfound {len(collected_user_ids)} user IDs overall.') + + # give the user a choice if followers should be included in the lookup + # (but only in case they make up a large amount): + unknown_collected_user_ids: set = set(collected_user_ids).difference(users.keys()) + unknown_follower_user_ids: set = unknown_collected_user_ids.intersection(collected_user_ids_only_in_followers) + if len(unknown_follower_user_ids) > 5000: + # Account metadata observed at ~2.1KB on average. + estimated_follower_lookup_size = int(2.1 * len(unknown_follower_user_ids)) + # we can look up at least 3000 users per minute. + estimated_max_follower_lookup_time_in_minutes = len(unknown_follower_user_ids) / 3000 + print( + f'For some user IDs, the @handle is not included in the archive data. ' + f'Unknown user handles can be looked up online.' + f'{len(unknown_follower_user_ids)} of {len(unknown_collected_user_ids)} total ' + f'user IDs with unknown handles are from your followers. 
Online lookup would be ' + f'about {estimated_follower_lookup_size:,} KB smaller and up to ' + f'{estimated_max_follower_lookup_time_in_minutes:.1f} minutes faster without them.\n' + ) + + if not get_consent(f'Do you want to include handles of your followers ' + f'in the online lookup of user handles anyway?', default_to_yes=True): + collected_user_ids = collected_user_ids_without_followers + + lookup_users(collected_user_ids, users) + + parse_followings(users, user_id_url_template, paths) + parse_followers(users, user_id_url_template, paths) + parse_direct_messages(username, users, user_id_url_template, paths) + parse_group_direct_messages(username, users, user_id_url_template, paths) # Download larger images, if the user agrees - print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") - print(f'Please be aware that this script may download a lot of data, which will cost you money if you are') - print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too') - print(f'frequent. This script may not work if your account is protected. You may want to set it to public') - print(f'before starting the download.') - user_input = input('\nOK to start downloading? [y/n]') - if user_input.lower() in ('y', 'yes'): - download_larger_media(media_sources, paths) - print('In case you set your account to public before initiating the download, do not forget to protect it again.') + if len(media_sources) > 0: + print(f"\nThe archive doesn't contain the original-size images. We can attempt to download them from twimg.com.") + print(f'Please be aware that this script may download a lot of data, which will cost you money if you are') + print(f'paying for bandwidth. Please be aware that the servers might block these requests if they are too') + print(f'frequent. This script may not work if your account is protected. You may want to set it to public') + print(f'before starting the download.\n') + + estimated_download_time_str = format_duration(len(media_sources) * 0.4) + + if get_consent(f'OK to start downloading {len(media_sources)} media files? ' + f'This will take at least {estimated_download_time_str}.'): + + download_larger_media(media_sources, paths) + print('In case you set your account to public before initiating the download, ' + 'do not forget to protect it again.') if __name__ == "__main__":
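The consent prompts in main() are driven by rough estimates taken from the code above: about 2.1 KB of account metadata per unknown user, a lookup rate of at least 3000 users per minute, and roughly 0.4 seconds per media download (rendered via format_duration). A small worked example of that arithmetic; the counts are invented for illustration:

```python
unknown_follower_count = 12_000   # hypothetical number of follower IDs with unknown handles
media_file_count = 900            # hypothetical number of media files queued for download

estimated_lookup_size_kb = int(2.1 * unknown_follower_count)   # 25200 KB of extra metadata
estimated_lookup_minutes = unknown_follower_count / 3000       # up to 4.0 extra minutes
estimated_download_seconds = media_file_count * 0.4            # 360 s, which format_duration() turns into minutes/seconds

print(f'{estimated_lookup_size_kb:,} KB, {estimated_lookup_minutes:.1f} min, {estimated_download_seconds:.0f} s')
```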