soxoj · soxoj · Dec 9, 2021 · Dec 8, 2021 · Dec 9, 2021
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -35,4 +35,4 @@ jobs:
       run: |
         pip install pytest==6.0.1 pytest-rerunfailures
         # do not run tests required auth with secrets
-        pytest -k 'not cookies' -m 'not github_failed' --reruns 3 --reruns-delay 30
+        pytest -k 'not cookies' -m 'not github_failed and not rate_limited' --reruns 3 --reruns-delay 30
diff --git a/README.md b/README.md
@@ -72,3 +72,10 @@ As a Python library:
 ...and many others.
 
 Check [tests file](./tests/test_e2e.py) for extracted data examples, [schemes file](./socid_extractor/schemes.py) to check all supported sites.
+
+
+## Testing
+
+```sh
+python3 -m pytest tests/test_e2e.py -n 10  -k 'not cookies' -m 'not github_failed and not rate_limited'
+```
diff --git a/pytest.ini b/pytest.ini
@@ -1,3 +1,4 @@
 [pytest]
 markers =
-    github_failed: marks tests as failed at GitHub Actions CI (deselect with '-m "not github_failed"')
+    github_failed: marks tests that fail only on GitHub Actions CI (deselect with '-m "not github_failed"')
+    rate_limited: marks tests that fail in any environment because of the site's anti-bot protection, captcha, or rate limiting (deselect with '-m "not rate_limited"')
diff --git a/socid_extractor/main.py b/socid_extractor/main.py
@@ -6,7 +6,7 @@
 from .utils import parse_cookies
 
 HEADERS = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.3729.169 Safari/537.36',
     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
 }
 

diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py
@@ -75,13 +75,14 @@
             'following_count': lambda x: x.get('stats', {}).get('subscribersCount'),
         }
     },
+    # TODO: rework
     'Yandex Market user profile': {
         'flags': ['MarketNode', '{"entity":"user"'],
-        'regex': r'type="application/json">({"widgets":{"@MarketNode/UserReviews".+?)</script>',
+        'regex': r'>{"widgets":{"@MarketNode/MyArticles/ArticlesGrid.+?"collections":({"publicUser":{"\d+".+?}}})}<',
         'extract_json': True,
         'transforms': [
             json.loads,
-            lambda x: list(x['collections']['user'].values())[0],
+            lambda x: list(x['publicUser'].values())[0],
             json.dumps,
         ],
         'fields': {
@@ -440,7 +441,7 @@
         'flags': ['OK.startupData'],
         'regex': r'path:"/(profile/)?(?P<ok_user_name_id>.+?)",state:".+?friendId=(?P<ok_id>\d+?)"',
     },
-    'Habrahabr': {
+    'Habrahabr HTML (old)': {
         'flags': ['habracdn.net'],
         'bs': True,
         'fields': {
@@ -449,6 +450,30 @@
             'image': lambda x: 'http:' + x.find('div', {'class': 'user-info__stats'}).find('img').get('src'),
         },
     },
+    'Habrahabr JSON': {
+        'flags': ['habrastorage.org'],
+        'regex': r'({"authorRefs":{.+?}),"viewport',
+        'extract_json': True,
+        'transforms': [
+            json.loads,
+            lambda x: list(x['authorRefs'].values())[0],
+            json.dumps,
+        ],
+        'fields': {
+            'username': lambda x: x['alias'],
+            'about': lambda x: x['speciality'],
+            'birthday': lambda x: x['birthday'],
+            'gender': lambda x: x['gender'],
+            'rating': lambda x: x['rating'],
+            'karma': lambda x: x['scoreStats']['score'],
+            'fullname': lambda x: x['fullname'],
+            'is_readonly': lambda x: x['isReadonly'],
+            'location': lambda x: x['location'],
+            'image': lambda x: x['avatarUrl'],
+            'follower_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followStats'),
+            'following_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followersCount'),
+        }
+    },
     # unactual
     'Twitter HTML': {
         'flags': ['abs.twimg.com', 'moreCSSBundles'],
@@ -690,7 +715,7 @@
         'fields': {
             'created_at': lambda x: x.get('createdDate'),
             'updated_at': lambda x: x.get('modifiedDate'),
-            'gaia_id': lambda x: x.get('permissions')[1]['id'],
+            'fake_gaia_id': lambda x: x.get('permissions')[1]['id'],
             'fullname': lambda x: x.get('permissions')[1]['name'],
             'email': lambda x: x.get('permissions')[1]['emailAddress'],
             'image': lambda x: x.get('permissions')[1]['photoLink'],
@@ -863,26 +888,30 @@
     },
     'SoundCloud': {
         'flags': ['eventlogger.soundcloud.com'],
-        'regex': r'catch\(e\)\{\}\}\)\},(\[\{"id":.+?)\);',
+        'regex': r'{"hydratable":"user","data":({.+?)}];',
         'extract_json': True,
         'message': 'Run with auth cookies to get your ids.',
+        'transforms': [
+            json.loads,
+            json.dumps,
+        ],
         'fields': {
-            'uid': lambda x: x[-1]['data'][0]['id'],
-            'name': lambda x: x[-1]['data'][0]['full_name'],
-            'username': lambda x: x[-1]['data'][0]['username'].lstrip('@'),
-            'following_count': lambda x: x[-1]['data'][0]['followings_count'],
-            'follower_count': lambda x: x[-1]['data'][0]['followers_count'],
-            'is_verified': lambda x: x[-1]['data'][0]['verified'],
-            'image': lambda x: x[-1]['data'][0]['avatar_url'],
-            'location': lambda x: x[-1]['data'][0]['city'],
-            'country_code': lambda x: x[-1]['data'][0]['country_code'],
-            'bio': lambda x: x[-1]['data'][0]['description'],
-            'created_at': lambda x: x[-1]['data'][0]['created_at'],
+            'uid': lambda x: x['id'],
+            'name': lambda x: x['full_name'],
+            'username': lambda x: x['username'].lstrip('@'),
+            'following_count': lambda x: x['followings_count'],
+            'follower_count': lambda x: x['followers_count'],
+            'is_verified': lambda x: x['verified'],
+            'image': lambda x: x['avatar_url'],
+            'location': lambda x: x['city'],
+            'country_code': lambda x: x['country_code'],
+            'bio': lambda x: x['description'],
+            'created_at': lambda x: x['created_at'],
         }
     },
     'TikTok': {
         'flags': ['tiktokcdn.com', '__NEXT_DATA__'],
-        'regex': r'<script id="__NEXT_DATA__" type="application/json" crossorigin="anonymous">(.+?)</script>',
+        'regex': r'<script id="__NEXT_DATA__"[^>]+>(.+?)</script>',
         'extract_json': True,
         'transforms': [
             json.loads,
@@ -1538,8 +1567,8 @@
         }
     },
     'ifunny.co': {
-        'flags': ['"og:site_name" content="iFunny"/>', '"preconnect" href="//img.ifunny.co/'],
-        'regex': r'window.__INITIAL_STATE__ = (.*);</script>  <script>function loadScriptAsync',
+        'flags': ["gtag('config', 'UA-23094255-1');"],
+        'regex': r'window.__INITIAL_STATE__=(.+?);',
         'extract_json': True,
         'transforms': [
             json.loads,
@@ -1550,7 +1579,7 @@
             'id': lambda x: x['id'],
             'username': lambda x: x['nick'],
             'bio': lambda x: x['about'],
-            'image': lambda x: x['photo']['url'],
+            'image': lambda x: x['avatar']['url'],
             'follower_count': lambda x: x['num']['subscriptions'],
             'following_count': lambda x: x['num']['subscribers'],
             'post_count': lambda x: x['num']['total_posts'],

diff --git a/socid_extractor/utils.py b/socid_extractor/utils.py
@@ -1,7 +1,7 @@
 import logging
 import math
 import re
-from datetime import datetime
+from datetime import datetime, timezone
 from http.cookies import SimpleCookie
 
 def import_cookiejar(filename):