diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 83d8791..cbac4fe 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -35,4 +35,4 @@ jobs: run: | pip install pytest==6.0.1 pytest-rerunfailures # do not run tests required auth with secrets - pytest -k 'not cookies' -m 'not github_failed' --reruns 3 --reruns-delay 30 + pytest -k 'not cookies' -m 'not github_failed and not rate_limited' --reruns 3 --reruns-delay 30 diff --git a/README.md b/README.md index 5349f40..fca925c 100644 --- a/README.md +++ b/README.md @@ -72,3 +72,10 @@ As a Python library: ...and many others. Check [tests file](./tests/test_e2e.py) for extracted data examples, [schemes file](./socid_extractor/schemes.py) to check all supported sites. + + +## Testing + +```sh +python3 -m pytest tests/test_e2e.py -n 10 -k 'not cookies' -m 'not github_failed and not rate_limited' +``` diff --git a/pytest.ini b/pytest.ini index 0157957..a8719dc 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] markers = - github_failed: marks tests as failed at GitHub Actions CI (deselect with '-m "not github_failed"') + github_failed: marks tests as failed only at GitHub Actions CI (deselect with '-m "not github_failed"') + rate_limited: marks tests as failed in general because of anti-bot / captcha / rate limiting from the site (deselect with '-m "not rate_limited"') \ No newline at end of file diff --git a/socid_extractor/main.py b/socid_extractor/main.py index 73b2ded..9af7812 100644 --- a/socid_extractor/main.py +++ b/socid_extractor/main.py @@ -6,7 +6,7 @@ from .utils import parse_cookies HEADERS = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.3729.169 Safari/537.36', "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", } diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py index bca213c..c15d0eb 100644 --- a/socid_extractor/schemes.py +++ b/socid_extractor/schemes.py @@ -75,13 +75,14 @@ 'following_count': lambda x: x.get('stats', {}).get('subscribersCount'), } }, + # TODO: rework 'Yandex Market user profile': { 'flags': ['MarketNode', '{"entity":"user"'], - 'regex': r'type="application/json">({"widgets":{"@MarketNode/UserReviews".+?)', + 'regex': r'>{"widgets":{"@MarketNode/MyArticles/ArticlesGrid.+?"collections":({"publicUser":{"\d+".+?}}})}<', 'extract_json': True, 'transforms': [ json.loads, - lambda x: list(x['collections']['user'].values())[0], + lambda x: list(x['publicUser'].values())[0], json.dumps, ], 'fields': { @@ -440,7 +441,7 @@ 'flags': ['OK.startupData'], 'regex': r'path:"/(profile/)?(?P.+?)",state:".+?friendId=(?P\d+?)"', }, - 'Habrahabr': { + 'Habrahabr HTML (old)': { 'flags': ['habracdn.net'], 'bs': True, 'fields': { @@ -449,6 +450,30 @@ 'image': lambda x: 'http:' + x.find('div', {'class': 'user-info__stats'}).find('img').get('src'), }, }, + 'Habrahabr JSON': { + 'flags': ['habrastorage.org'], + 'regex': r'({"authorRefs":{.+?}),"viewport', + 'extract_json': True, + 'transforms': [ + json.loads, + lambda x: list(x['authorRefs'].values())[0], + json.dumps, + ], + 'fields': { + 'username': lambda x: x['alias'], + 'about': lambda x: x['speciality'], + 'birthday': lambda x: x['birthday'], + 'gender': lambda x: x['gender'], + 'rating': lambda x: x['rating'], + 'karma': lambda x: x['scoreStats']['score'], + 'fullname': lambda x: x['fullname'], + 'is_readonly': lambda x: x['isReadonly'], + 'location': lambda x: x['location'], + 'image': lambda x: x['avatarUrl'], + 'follower_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followStats'), + 'following_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followersCount'), + } + }, # unactual 'Twitter HTML': { 'flags': ['abs.twimg.com', 'moreCSSBundles'], @@ -690,7 +715,7 @@ 'fields': { 'created_at': lambda x: x.get('createdDate'), 'updated_at': lambda x: x.get('modifiedDate'), - 'gaia_id': lambda x: x.get('permissions')[1]['id'], + 'fake_gaia_id': lambda x: x.get('permissions')[1]['id'], 'fullname': lambda x: x.get('permissions')[1]['name'], 'email': lambda x: x.get('permissions')[1]['emailAddress'], 'image': lambda x: x.get('permissions')[1]['photoLink'], @@ -863,26 +888,30 @@ }, 'SoundCloud': { 'flags': ['eventlogger.soundcloud.com'], - 'regex': r'catch\(e\)\{\}\}\)\},(\[\{"id":.+?)\);', + 'regex': r'{"hydratable":"user","data":({.+?)}];', 'extract_json': True, 'message': 'Run with auth cookies to get your ids.', + 'transforms': [ + json.loads, + json.dumps, + ], 'fields': { - 'uid': lambda x: x[-1]['data'][0]['id'], - 'name': lambda x: x[-1]['data'][0]['full_name'], - 'username': lambda x: x[-1]['data'][0]['username'].lstrip('@'), - 'following_count': lambda x: x[-1]['data'][0]['followings_count'], - 'follower_count': lambda x: x[-1]['data'][0]['followers_count'], - 'is_verified': lambda x: x[-1]['data'][0]['verified'], - 'image': lambda x: x[-1]['data'][0]['avatar_url'], - 'location': lambda x: x[-1]['data'][0]['city'], - 'country_code': lambda x: x[-1]['data'][0]['country_code'], - 'bio': lambda x: x[-1]['data'][0]['description'], - 'created_at': lambda x: x[-1]['data'][0]['created_at'], + 'uid': lambda x: x['id'], + 'name': lambda x: x['full_name'], + 'username': lambda x: x['username'].lstrip('@'), + 'following_count': lambda x: x['followings_count'], + 'follower_count': lambda x: x['followers_count'], + 'is_verified': lambda x: x['verified'], + 'image': lambda x: x['avatar_url'], + 'location': lambda x: x['city'], + 'country_code': lambda x: x['country_code'], + 'bio': lambda x: x['description'], + 'created_at': lambda x: x['created_at'], } }, 'TikTok': { 'flags': ['tiktokcdn.com', '__NEXT_DATA__'], - 'regex': r'', + 'regex': r'', 'extract_json': True, 'transforms': [ json.loads, @@ -1538,8 +1567,8 @@ } }, 'ifunny.co': { - 'flags': ['"og:site_name" content="iFunny"/>', '"preconnect" href="//img.ifunny.co/'], - 'regex': r'window.__INITIAL_STATE__ = (.*);