From e475813228e1a26a5373d9af57cb9a25207a7bfc Mon Sep 17 00:00:00 2001 From: Sarabjit Dhiman Date: Thu, 26 Sep 2024 17:59:41 +0530 Subject: [PATCH] Bump Version to 2.0.0 (#53, #56, #61) Move configurations to TweeterPy constructor\nAdd Proxy support\nOption to modify log_level --- README.md | 22 ++--------- docs/config.md | 78 --------------------------------------- quickstart.py | 7 ++-- setup.py | 2 +- tweeterpy/api_util.py | 5 +-- tweeterpy/config.py | 76 -------------------------------------- tweeterpy/constants.py | 53 ++++++++++++++++++++++++++ tweeterpy/logging_util.py | 8 ++-- tweeterpy/request_util.py | 73 +++++++++++++++--------------------- tweeterpy/session_util.py | 14 +++---- tweeterpy/tweeterpy.py | 54 ++++++++++++++------------- tweeterpy/util.py | 7 ++-- 12 files changed, 135 insertions(+), 264 deletions(-) delete mode 100644 docs/config.md delete mode 100644 tweeterpy/config.py diff --git a/README.md b/README.md index 0ce416d..54b7b96 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,9 @@ OR ```python from twitter import TweeterPy - -TweeterPy() +# proxy = {'http': 'proxy_here', 'https': 'proxy_here'} +proxy = None +TweeterPy(proxies=proxy, log_level="INFO") ``` > ### Example - Get User ID of a User. @@ -53,23 +54,6 @@ Check out step by step guide. [Documentation](docs/docs.md) -## Configuration - -> ### Example - Config Usage - -```python -from tweeterpy import config - -config.PROXY = {"http":"127.0.0.1","https":"127.0.0.1"} -config.TIMEOUT = 10 -config.UPDATE_API = False - -``` - -Check out configuration docs for the available settings. - -[Configurations](docs/config.md) - ## Features - Extracts Tweets diff --git a/docs/config.md b/docs/config.md deleted file mode 100644 index c4cd740..0000000 --- a/docs/config.md +++ /dev/null @@ -1,78 +0,0 @@ -

Configuration

- -# Importing - -```python -from tweeterpy import config -``` - -> ### Example - Config Usage - -```python -from tweeterpy import TweeterPy -from tweeterpy import config - -config.PROXY = {"http":"127.0.0.1","https":"127.0.0.1"} -config.TIMEOUT = 10 - -twitter = TweeterPy() - -print(twitter.get_user_id('elonmusk')) - -``` - -## Retries Limit - -```python -# Maximun number of retries for each request -config.MAX_RETRIES = 3 -``` - -## Request Timeout - -```python -# request timeout - in seconds -config.TIMEOUT = 5 -``` - -## Using Proxies - -```python -# Example {"http":"proxy_here","https":"proxy_here"} Accepts python dictionary. -config.PROXY = None -``` - -## Sessions Directory - -```python -# Directory path/name to save and load logged in sessions/cookies. Default path is current directory. i.e. current_path/Twitter Saved Sessions -config.SESSION_DIRECTORY = "Twitter Saved Sessions" -``` - -## Logs - -```python -# File name to save logs. -LOG_FILE_NAME = "tweeterpy.log" - -# Logging level : "DEBUG","INFO","WARNING","ERROR","CRITICAL" -# If None, "INFO" will be used for Stream/Console logs and "DEBUG" will be used for file logs. -# LOG_LEVEL = "INFO" -LOG_LEVEL = None - -# Disable logs for imported modules/libraries only. -DISABLE_EXTERNAL_LOGS = False - -# Disable logs completely. (It sets logging level to "ERROR".) -DISABLE_LOGS = False - -# Log Configuration. Set Custom Log configuration in dict format. -LOGGING_CONFIG = {} -``` - -## API Updates - -```python -# Disable/Enable Api Update which occurs at the startup Initialization. -UPDATE_API = True -``` diff --git a/quickstart.py b/quickstart.py index 3011e86..9ba2581 100644 --- a/quickstart.py +++ b/quickstart.py @@ -1,12 +1,11 @@ from tweeterpy import TweeterPy -from tweeterpy import config from tweeterpy.util import find_nested_key def main(): - # config.TIMEOUT = 5 - # config.PROXY = {'http': 'proxy_here', 'https': 'proxy_here'} - twitter = TweeterPy() + # proxy = {'http': 'proxy_here', 'https': 'proxy_here'} + proxy = None + twitter = TweeterPy(proxies=proxy, log_level="INFO") print(twitter.get_user_id('elonmusk')) print(twitter.get_user_info('elonmusk')) # print(twitter.get_user_data('elonmusk')) diff --git a/setup.py b/setup.py index a20b212..37f5d00 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -VERSION = "1.2.4" +VERSION = "2.0.0" SHORT_DESCRIPTION = "TweeterPy is a python library to extract data from Twitter. TweeterPy API lets you scrape data from a user's profile like username, userid, bio, followers/followings list, profile media, tweets, etc." with open("requirements.txt") as file: diff --git a/tweeterpy/api_util.py b/tweeterpy/api_util.py index 652ea2d..35532f3 100644 --- a/tweeterpy/api_util.py +++ b/tweeterpy/api_util.py @@ -4,11 +4,10 @@ import tempfile import demjson3 import logging.config -from tweeterpy import config from tweeterpy.request_util import RequestClient -from tweeterpy.constants import Path, FeatureSwitch, API_TMP_FILE +from tweeterpy.constants import Path, FeatureSwitch, API_TMP_FILE, LOGGING_CONFIG -logging.config.dictConfig(config.LOGGING_CONFIG) +logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) dataset_regex = re.compile(r'''exports\s*=\s*{(.*?)},''', re.VERBOSE) diff --git a/tweeterpy/config.py b/tweeterpy/config.py deleted file mode 100644 index e787f8d..0000000 --- a/tweeterpy/config.py +++ /dev/null @@ -1,76 +0,0 @@ -# Configuration File -_RATE_LIMIT_STATS = None # Used to keep a track of api limits. DON'T CHANGE IT - -_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' - -# Maximun number of retries for each request -MAX_RETRIES = 3 - -# request timeout - in seconds -TIMEOUT = 5 - -# Example {"http":"proxy_here","https":"proxy_here"} Accepts python dictionary. -PROXY = None - -# Directory path/name to save and load logged in sessions/cookies. Default path is current directory. i.e. current_path/Twitter Saved Sessions -SESSION_DIRECTORY = "Twitter Saved Sessions" - -# File name to save logs. -LOG_FILE_NAME = "tweeterpy.log" - -# Logging level : "DEBUG","INFO","WARNING","ERROR","CRITICAL" -# If None, "INFO" will be used for Stream/Console logs and "DEBUG" will be used for file logs. -# LOG_LEVEL = "INFO" -LOG_LEVEL = None - -# Disable logs for imported modules/libraries only. -DISABLE_EXTERNAL_LOGS = False - -# Disable logs completely. (It sets logging level to "ERROR".) -DISABLE_LOGS = False - -# Log Configuration. -LOGGING_CONFIG = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'standard': { - 'format': '%(asctime)s [%(levelname)s] [Line No. %(lineno)d] %(name)s : %(funcName)s :: %(message)s' - }, - 'custom': { - # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' - 'class': 'tweeterpy.logging_util.CustomFormatter', - } - }, - 'handlers': { - 'stream': { - 'level': LOG_LEVEL or 'INFO', - 'formatter': 'custom', - 'class': 'logging.StreamHandler', - 'stream': 'ext://sys.stdout' - }, - 'file': { - 'level': LOG_LEVEL or 'DEBUG', - 'formatter': 'standard', - 'class': 'logging.FileHandler', - 'filename': LOG_FILE_NAME, - "encoding": "utf-8" - } - }, - 'loggers': { - '': { # root logger - 'handlers': ['stream', 'file'], - 'level': LOG_LEVEL or 'DEBUG' - }, - '__main__': { # if __name__ == '__main__' - 'handlers': ['stream', 'file'], - 'level': LOG_LEVEL or 'DEBUG', - } - } -} - -# Disable/Enable Api Update which occurs at the startup Initialization. -UPDATE_API = True - -if __name__ == "__main__": - pass diff --git a/tweeterpy/constants.py b/tweeterpy/constants.py index 66b6f0d..92995f3 100644 --- a/tweeterpy/constants.py +++ b/tweeterpy/constants.py @@ -4,9 +4,62 @@ PUBLIC_TOKEN = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' + # Filename to store api data/endpoints as a backup. API_TMP_FILE = "tweeterpy_api.json" +# Directory path/name to save and load logged in sessions/cookies. Default path is current directory. i.e. current_path/Twitter Saved Sessions +DEFAULT_SESSION_DIRECTORY = "Twitter Saved Sessions" + +# File name to save logs. +LOG_FILE_NAME = "tweeterpy.log" + +# Logging level : "DEBUG","INFO","WARNING","ERROR","CRITICAL" +# If None, "INFO" will be used for Stream/Console logs and "DEBUG" will be used for file logs. +# LOG_LEVEL = "INFO" +LOG_LEVEL = "INFO" + +# Log Configuration. +LOGGING_CONFIG = { + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'standard': { + 'format': '%(asctime)s [%(levelname)s] [Line No. %(lineno)d] %(name)s : %(funcName)s :: %(message)s' + }, + 'custom': { + # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' + 'class': 'tweeterpy.logging_util.CustomFormatter', + } + }, + 'handlers': { + 'stream': { + 'level': LOG_LEVEL, + 'formatter': 'custom', + 'class': 'logging.StreamHandler', + 'stream': 'ext://sys.stdout' + }, + 'file': { + 'level': 'DEBUG', + 'formatter': 'standard', + 'class': 'logging.FileHandler', + 'filename': LOG_FILE_NAME, + "encoding": "utf-8" + } + }, + 'loggers': { + '': { # root logger + 'handlers': ['stream', 'file'], + 'level': 'DEBUG' + }, + '__main__': { # if __name__ == '__main__' + 'handlers': ['stream', 'file'], + 'level': 'DEBUG', + } + } +} + class Color: BLACK = "\033[0;30m" diff --git a/tweeterpy/logging_util.py b/tweeterpy/logging_util.py index 6719622..153ebd6 100644 --- a/tweeterpy/logging_util.py +++ b/tweeterpy/logging_util.py @@ -1,5 +1,4 @@ import logging -from tweeterpy import config from tweeterpy.constants import Color @@ -23,6 +22,8 @@ def format(self, record): def set_log_level(log_level=None, return_loggers=False, external_only=False): + if log_level and log_level not in logging._levelToName and log_level not in logging._levelToName.values(): + raise Exception("Invalid Log Level") if log_level is None: log_level = logging.ERROR all_loggers = {} @@ -44,9 +45,8 @@ def wrapper(*args, **kwargs): except Exception as error: raise error finally: - if not config.DISABLE_LOGS: - [logging.getLogger(current_logger).setLevel(all_loggers.get( - current_logger)) for current_logger in logging.root.manager.loggerDict.keys() if current_logger in list(all_loggers.keys())] + [logging.getLogger(current_logger).setLevel(all_loggers.get(current_logger)) + for current_logger in logging.root.manager.loggerDict.keys() if current_logger in list(all_loggers.keys())] return returned_output return wrapper diff --git a/tweeterpy/request_util.py b/tweeterpy/request_util.py index 85df6ec..10fffb9 100644 --- a/tweeterpy/request_util.py +++ b/tweeterpy/request_util.py @@ -2,12 +2,11 @@ import requests import logging.config from tweeterpy import util -from tweeterpy import config from urllib.parse import urlparse from tweeterpy.tid import ClientTransaction -from requests.exceptions import ProxyError, InvalidProxyURL +from tweeterpy.constants import LOGGING_CONFIG -logging.config.dictConfig(config.LOGGING_CONFIG) +logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) @@ -16,13 +15,9 @@ def __init__(self, session: requests.Session): self.session = session self.client_transaction = None - def request(self, url, method=None, max_retries=None, timeout=None, skip_error_checking=False, **kwargs): + def request(self, url, method=None, skip_error_checking=False, **kwargs): if method is None: method = "GET" - if max_retries is None: - max_retries = config.MAX_RETRIES or 3 - if timeout is None: - timeout = config.TIMEOUT or 30 tid = None logger.debug(f"{locals()}") headers = kwargs.pop("headers", {}) @@ -30,41 +25,33 @@ def request(self, url, method=None, max_retries=None, timeout=None, skip_error_c tid = self.client_transaction.generate_transaction_id( method=method, path=urlparse(url).path) headers["X-Client-Transaction-Id"] = tid - for retry_count, _ in enumerate(range(max_retries), start=1): - response_text, api_limit_stats = "", {} - try: - response = self.session.request( - method, url, headers=headers, timeout=timeout, **kwargs) - api_limit_stats = util.check_api_rate_limits(response) or {} - if "json" in response.headers.get("Content-Type", ""): - response = response.json() - if api_limit_stats: - config._RATE_LIMIT_STATS = api_limit_stats - response.update({"api_rate_limit": api_limit_stats}) - if skip_error_checking: - return response - return util.check_for_errors(response) - soup = bs4.BeautifulSoup(response.content, "lxml") - response_text = "\n".join( - [line.strip() for line in soup.text.split("\n") if line.strip()]) - response.raise_for_status() - return soup - except KeyboardInterrupt: - logger.warn("Keyboard Interruption...") - return - except (ProxyError, InvalidProxyURL) as proxy_error: - logger.error(f"{proxy_error}") - if retry_count >= max_retries: - raise proxy_error - except Exception as error: - logger.debug(f"Retry No. ==> {retry_count}") - if retry_count >= max_retries: - logger.exception(f"{error}\n{response_text}\n") - if api_limit_stats.get('rate_limit_exhausted'): - logger.error( - f"Rate Limit Exceeded => {api_limit_stats}") - raise util.RateLimitError('API Rate Limit Exceeded.') - raise error + + response_text, api_limit_stats = "", {} + try: + response = self.session.request( + method, url, headers=headers, **kwargs) + api_limit_stats = util.check_api_rate_limits(response) or {} + if "json" in response.headers.get("Content-Type", ""): + response = response.json() + if api_limit_stats: + response.update({"api_rate_limit": api_limit_stats}) + if skip_error_checking: + return response + return util.check_for_errors(response) + soup = bs4.BeautifulSoup(response.content, "lxml") + response_text = "\n".join( + [line.strip() for line in soup.text.split("\n") if line.strip()]) + response.raise_for_status() + return soup + except KeyboardInterrupt: + logger.warn("Keyboard Interruption...") + return + except Exception as error: + logger.exception(f"{error}\n{response_text}\n") + if api_limit_stats.get('rate_limit_exhausted'): + logger.error(f"Rate Limit Exceeded => {api_limit_stats}") + raise util.RateLimitError('API Rate Limit Exceeded.') + raise error if __name__ == '__main__': diff --git a/tweeterpy/session_util.py b/tweeterpy/session_util.py index 43834a6..ed2f236 100644 --- a/tweeterpy/session_util.py +++ b/tweeterpy/session_util.py @@ -2,15 +2,15 @@ import pickle import requests import logging.config -from tweeterpy import config +from tweeterpy.constants import DEFAULT_SESSION_DIRECTORY, LOGGING_CONFIG -logging.config.dictConfig(config.LOGGING_CONFIG) +logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) def _create_session_directory(directory_path=None): if directory_path is None: - directory_path = config.SESSION_DIRECTORY or os.getcwd() + directory_path = DEFAULT_SESSION_DIRECTORY directory_path = os.path.realpath(os.path.expanduser(directory_path)) os.makedirs(directory_path, exist_ok=True) @@ -51,15 +51,15 @@ def save_session(filename=None, path=None, session=None): return file_path -def load_session(file_path=None, session=None): +def load_session(path=None, session=None): if session is None: raise NameError("name 'session' is not defined.") if not isinstance(session, requests.Session): raise TypeError( f"Invalid session type. {session} is not a requests.Session Object...") - if file_path is None: - file_path = _show_saved_sessions() - with open(file_path, "rb") as file: + if path is None: + path = _show_saved_sessions() + with open(path, "rb") as file: headers, cookies = pickle.load(file) session.headers = headers session.cookies = cookies diff --git a/tweeterpy/tweeterpy.py b/tweeterpy/tweeterpy.py index 848074f..4736f41 100644 --- a/tweeterpy/tweeterpy.py +++ b/tweeterpy/tweeterpy.py @@ -4,36 +4,41 @@ import requests import logging.config from functools import reduce +from typing import Union, Dict from tweeterpy import util -from tweeterpy import config from tweeterpy.api_util import ApiUpdater from tweeterpy.tid import ClientTransaction from tweeterpy.login_util import TaskHandler from tweeterpy.request_util import RequestClient from tweeterpy.logging_util import set_log_level -from tweeterpy.constants import Path, FeatureSwitch from tweeterpy.session_util import load_session, save_session +from tweeterpy.constants import Path, FeatureSwitch, LOGGING_CONFIG -logging.config.dictConfig(config.LOGGING_CONFIG) +logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) class TweeterPy: - def __init__(self): - if config.DISABLE_LOGS or config.DISABLE_EXTERNAL_LOGS: - logger.debug("Disabling logs...") - config.LOG_LEVEL = "ERROR" if config.DISABLE_LOGS else config.LOG_LEVEL - disable_external_only = config.DISABLE_EXTERNAL_LOGS if not config.DISABLE_LOGS else False - set_log_level(logging.ERROR, external_only=disable_external_only) + def __init__(self, proxies: Dict[str, str] = None, log_level: Union[str, int] = None): + """TweeterPy constructor + + Args: + proxies (dict, optional): Proxies to use. Format {"http":"proxy_here","https":"proxy_here"}. Defaults to None. + log_level (str, optional): Logging level : "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL". Defaults to None. + """ + set_log_level(log_level, external_only=False) + + if proxies and isinstance(proxies, str): + proxies = {'http': proxies, 'https': proxies} + self.proxies = proxies self.request_client: RequestClient = None self.generate_session() # update api endpoints token = self.request_client.session.headers.pop("Authorization") try: - ApiUpdater(request_client=self.request_client, - update_api=config.UPDATE_API) + ApiUpdater(request_client=self.request_client, update_api=True) except Exception as error: logger.warn(error) self.request_client.session.headers.update({"Authorization": token}) @@ -67,7 +72,7 @@ def filter_data(response): if not pagination and total: logger.warn("Either enable the pagination or disable total number of results.") raise Exception("pagination cannot be disabled while the total number of results are specified.") - data_container = {"data": [],"cursor_endpoint": None, "has_next_page": True, "api_rate_limit":config._RATE_LIMIT_STATS} + data_container = {"data": [],"cursor_endpoint": None, "has_next_page": True, "api_rate_limit":None} while data_container["has_next_page"]: try: if end_cursor: @@ -86,8 +91,8 @@ def filter_data(response): if end_cursor: end_cursor = reduce(dict.get, ('content','value'),end_cursor[0]) or reduce(dict.get, ('content','itemContent','value'),end_cursor[0]) data_container['data'].extend(filter_data(data)) - if config._RATE_LIMIT_STATS: - data_container['api_rate_limit'].update(config._RATE_LIMIT_STATS) + + data_container['api_rate_limit'].update({}) print(len(data_container['data']), end="\r") @@ -160,8 +165,8 @@ def generate_session(self, auth_token=None): logger.debug("Trying to generate a new session.") self.request_client = RequestClient(session=requests.Session()) session = self.request_client.session - if config.PROXY is not None: - session.proxies = config.PROXY + if self.proxies: + session.proxies = self.proxies session.verify = False session.headers.update(util.generate_headers()) # home_page = self.request_client.request(Path.BASE_URL) @@ -189,12 +194,13 @@ def generate_session(self, auth_token=None): logger.debug("Session has been generated.") return self.session - def save_session(self, session=None, session_name=None): + def save_session(self, session=None, session_name=None, path=None): """Save a logged in session to avoid frequent logins in future. Args: session (requests.Session, optional): requests.Session object you want to save. If None, saves current session by default. Defaults to None. session_name (str, optional): Session name. If None, uses currently logged in username. Defaults to None. + path (str, optional): Session directory. If None, uses DEFAULT_SESSION_DIRECTORY from constants.py. Defaults to None. Returns: path: Saved session file path. @@ -203,22 +209,20 @@ def save_session(self, session=None, session_name=None): session = self.request_client.session if session_name is None: session_name = self.me['data']['viewer']['user_results']['result']['legacy']['screen_name'] - return save_session(filename=session_name, session=session) + return save_session(filename=session_name, path=path, session=session) - def load_session(self, session_file_path=None, session=None): + def load_session(self, path=None): """Load a saved session. Args: - session_file_path (path, optional): File path to load session from. If None, shows a list of all saved session to choose from. Defaults to None. - session (request.Session, optional): requests.Session object to load a saved session into. Defaults to None. + path (str, optional): Session file path. If None, shows a list of all saved session to choose from. Defaults to None. Returns: requests.Session: Restored session. """ - if session is None: - session = self.generate_session() - self.request_client = RequestClient(session=load_session( - file_path=session_file_path, session=session)) + session = self.generate_session() + self.request_client = RequestClient( + session=load_session(path=path, session=session)) return self.session def logged_in(self): diff --git a/tweeterpy/util.py b/tweeterpy/util.py index a04cc40..7144ce0 100644 --- a/tweeterpy/util.py +++ b/tweeterpy/util.py @@ -6,11 +6,10 @@ from functools import reduce from typing import Dict, List from urllib.parse import urljoin -from tweeterpy import config -from tweeterpy.constants import Path, PUBLIC_TOKEN +from tweeterpy.constants import Path, PUBLIC_TOKEN, LOGGING_CONFIG, USER_AGENT from dataclasses import dataclass, field, fields, asdict, _MISSING_TYPE -logging.config.dictConfig(config.LOGGING_CONFIG) +logging.config.dictConfig(LOGGING_CONFIG) logger = logging.getLogger(__name__) @@ -33,7 +32,7 @@ def generate_headers(session=None, custom_headers=None): "Accept-Language": "en-US,en;q=0.9", "Cache-Control": "no-cache", "Referer": Path.BASE_URL, - "User-Agent": config._USER_AGENT, + "User-Agent": USER_AGENT, "X-Twitter-Active-User": "yes", "X-Twitter-Client-Language": "en" }