Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #110

Merged
merged 2 commits into from
Dec 9, 2021
Merged

Fixes #110

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ jobs:
run: |
pip install pytest==6.0.1 pytest-rerunfailures
# do not run tests that require auth with secrets
pytest -k 'not cookies' -m 'not github_failed' --reruns 3 --reruns-delay 30
pytest -k 'not cookies' -m 'not github_failed and not rate_limited' --reruns 3 --reruns-delay 30
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,10 @@ As a Python library:
...and many others.

Check the [tests file](./tests/test_e2e.py) for examples of extracted data and the [schemes file](./socid_extractor/schemes.py) for the list of all supported sites.


## Testing

```sh
python3 -m pytest tests/test_e2e.py -n 10 -k 'not cookies' -m 'not github_failed and not rate_limited'
```
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[pytest]
markers =
github_failed: marks tests as failed at GitHub Actions CI (deselect with '-m "not github_failed"')
github_failed: marks tests as failed only at GitHub Actions CI (deselect with '-m "not github_failed"')
rate_limited: marks tests as failed in general because of anti-bot / captcha / rate limiting from the site (deselect with '-m "not rate_limited"')
2 changes: 1 addition & 1 deletion socid_extractor/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .utils import parse_cookies

HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.3729.169 Safari/537.36',
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}

Expand Down
69 changes: 49 additions & 20 deletions socid_extractor/schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@
'following_count': lambda x: x.get('stats', {}).get('subscribersCount'),
}
},
# TODO: rework
'Yandex Market user profile': {
'flags': ['MarketNode', '{"entity":"user"'],
'regex': r'type="application/json">({"widgets":{"@MarketNode/UserReviews".+?)</script>',
'regex': r'>{"widgets":{"@MarketNode/MyArticles/ArticlesGrid.+?"collections":({"publicUser":{"\d+".+?}}})}<',
'extract_json': True,
'transforms': [
json.loads,
lambda x: list(x['collections']['user'].values())[0],
lambda x: list(x['publicUser'].values())[0],
json.dumps,
],
'fields': {
Expand Down Expand Up @@ -440,7 +441,7 @@
'flags': ['OK.startupData'],
'regex': r'path:"/(profile/)?(?P<ok_user_name_id>.+?)",state:".+?friendId=(?P<ok_id>\d+?)"',
},
'Habrahabr': {
'Habrahabr HTML (old)': {
'flags': ['habracdn.net'],
'bs': True,
'fields': {
Expand All @@ -449,6 +450,30 @@
'image': lambda x: 'http:' + x.find('div', {'class': 'user-info__stats'}).find('img').get('src'),
},
},
# Scheme entry 'Habrahabr JSON'.
# The regex captures the JSON object that starts at {"authorRefs":{ and ends
# just before the following ,"viewport key; the transforms then decode it,
# keep only the first value of the 'authorRefs' mapping, and re-encode it.
# (Presumably the consumer applies the transforms in list order and evaluates
# each 'fields' lambda on the decoded result -- confirm against the extractor.)
'Habrahabr JSON': {
'flags': ['habrastorage.org'],
'regex': r'({"authorRefs":{.+?}),"viewport',
'extract_json': True,
'transforms': [
json.loads,
# 'authorRefs' is used as a mapping here; only its first value is kept.
lambda x: list(x['authorRefs'].values())[0],
json.dumps,
],
'fields': {
# The fields below index the dict directly, so a missing key raises
# KeyError inside the lambda (unlike the .get() chains further down).
'username': lambda x: x['alias'],
'about': lambda x: x['speciality'],
'birthday': lambda x: x['birthday'],
'gender': lambda x: x['gender'],
'rating': lambda x: x['rating'],
'karma': lambda x: x['scoreStats']['score'],
'fullname': lambda x: x['fullname'],
'is_readonly': lambda x: x['isReadonly'],
'location': lambda x: x['location'],
'image': lambda x: x['avatarUrl'],
# NOTE(review): the two key paths below look mismatched. follower_count
# reads a 'followStats' key nested inside 'followStats', while
# following_count reads 'followersCount'. Verify both against a real
# payload; they may be wrong or swapped.
'follower_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followStats'),
'following_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followersCount'),
}
},
# outdated (no longer relevant)
'Twitter HTML': {
'flags': ['abs.twimg.com', 'moreCSSBundles'],
Expand Down Expand Up @@ -690,7 +715,7 @@
'fields': {
'created_at': lambda x: x.get('createdDate'),
'updated_at': lambda x: x.get('modifiedDate'),
'gaia_id': lambda x: x.get('permissions')[1]['id'],
'fake_gaia_id': lambda x: x.get('permissions')[1]['id'],
'fullname': lambda x: x.get('permissions')[1]['name'],
'email': lambda x: x.get('permissions')[1]['emailAddress'],
'image': lambda x: x.get('permissions')[1]['photoLink'],
Expand Down Expand Up @@ -863,26 +888,30 @@
},
'SoundCloud': {
'flags': ['eventlogger.soundcloud.com'],
'regex': r'catch\(e\)\{\}\}\)\},(\[\{"id":.+?)\);',
'regex': r'{"hydratable":"user","data":({.+?)}];',
'extract_json': True,
'message': 'Run with auth cookies to get your ids.',
'transforms': [
json.loads,
json.dumps,
],
'fields': {
'uid': lambda x: x[-1]['data'][0]['id'],
'name': lambda x: x[-1]['data'][0]['full_name'],
'username': lambda x: x[-1]['data'][0]['username'].lstrip('@'),
'following_count': lambda x: x[-1]['data'][0]['followings_count'],
'follower_count': lambda x: x[-1]['data'][0]['followers_count'],
'is_verified': lambda x: x[-1]['data'][0]['verified'],
'image': lambda x: x[-1]['data'][0]['avatar_url'],
'location': lambda x: x[-1]['data'][0]['city'],
'country_code': lambda x: x[-1]['data'][0]['country_code'],
'bio': lambda x: x[-1]['data'][0]['description'],
'created_at': lambda x: x[-1]['data'][0]['created_at'],
'uid': lambda x: x['id'],
'name': lambda x: x['full_name'],
'username': lambda x: x['username'].lstrip('@'),
'following_count': lambda x: x['followings_count'],
'follower_count': lambda x: x['followers_count'],
'is_verified': lambda x: x['verified'],
'image': lambda x: x['avatar_url'],
'location': lambda x: x['city'],
'country_code': lambda x: x['country_code'],
'bio': lambda x: x['description'],
'created_at': lambda x: x['created_at'],
}
},
'TikTok': {
'flags': ['tiktokcdn.com', '__NEXT_DATA__'],
'regex': r'<script id="__NEXT_DATA__" type="application/json" crossorigin="anonymous">(.+?)</script>',
'regex': r'<script id="__NEXT_DATA__"[^>]+>(.+?)</script>',
'extract_json': True,
'transforms': [
json.loads,
Expand Down Expand Up @@ -1538,8 +1567,8 @@
}
},
'ifunny.co': {
'flags': ['"og:site_name" content="iFunny"/>', '"preconnect" href="//img.ifunny.co/'],
'regex': r'window.__INITIAL_STATE__ = (.*);</script> <script>function loadScriptAsync',
'flags': ["gtag('config', 'UA-23094255-1');"],
'regex': r'window.__INITIAL_STATE__=(.+?);',
'extract_json': True,
'transforms': [
json.loads,
Expand All @@ -1550,7 +1579,7 @@
'id': lambda x: x['id'],
'username': lambda x: x['nick'],
'bio': lambda x: x['about'],
'image': lambda x: x['photo']['url'],
'image': lambda x: x['avatar']['url'],
'follower_count': lambda x: x['num']['subscriptions'],
'following_count': lambda x: x['num']['subscribers'],
'post_count': lambda x: x['num']['total_posts'],
Expand Down
2 changes: 1 addition & 1 deletion socid_extractor/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
import math
import re
from datetime import datetime
from datetime import datetime, timezone
from http.cookies import SimpleCookie

def import_cookiejar(filename):
Expand Down
Loading