# Patch: socid_extractor/schemes.py + tests/test_e2e.py
#
# schemes.py: marks the Yandex Market scheme for rework and points it at the
#   "publicUser" collection; renames 'Habrahabr' to 'Habrahabr HTML (old)';
#   adds a 'Habrahabr JSON' scheme that reads the first "authorRefs" entry;
#   touches the TikTok regex line.
# tests/test_e2e.py: updates test_habr to the JSON fields, skips test_500px,
#   relaxes the Pinterest following_count assertion to a presence check.
#
# Review fix (Habrahabr JSON): follower_count previously read a 'followStats'
# key nested inside followStats, and following_count read 'followersCount'.
# follower_count now reads 'followersCount'; following_count reads
# 'followCount' -- assumed sibling key name, TODO confirm against a live payload.
#
# NOTE(review): the OK regex context line ('(?P.+?)', '(?P\d+?)') and both
# sides of the TikTok regex (r'') look like they lost angle-bracketed text in
# extraction; they are reproduced unchanged here -- restore from the repository
# before applying.
diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py
index 38462ea..c15d0eb 100644
--- a/socid_extractor/schemes.py
+++ b/socid_extractor/schemes.py
@@ -75,13 +75,14 @@
             'following_count': lambda x: x.get('stats', {}).get('subscribersCount'),
         }
     },
+    # TODO: rework
     'Yandex Market user profile': {
         'flags': ['MarketNode', '{"entity":"user"'],
-        'regex': r'type="application/json">({"widgets":{"@MarketNode/UserReviews".+?)',
+        'regex': r'>{"widgets":{"@MarketNode/MyArticles/ArticlesGrid.+?"collections":({"publicUser":{"\d+".+?}}})}<',
         'extract_json': True,
         'transforms': [
             json.loads,
-            lambda x: list(x['collections']['user'].values())[0],
+            lambda x: list(x['publicUser'].values())[0],
             json.dumps,
         ],
         'fields': {
@@ -440,7 +441,7 @@
         'flags': ['OK.startupData'],
         'regex': r'path:"/(profile/)?(?P.+?)",state:".+?friendId=(?P\d+?)"',
     },
-    'Habrahabr': {
+    'Habrahabr HTML (old)': {
         'flags': ['habracdn.net'],
         'bs': True,
         'fields': {
@@ -449,6 +450,30 @@
             'image': lambda x: 'http:' + x.find('div', {'class': 'user-info__stats'}).find('img').get('src'),
         },
     },
+    'Habrahabr JSON': {
+        'flags': ['habrastorage.org'],
+        'regex': r'({"authorRefs":{.+?}),"viewport',
+        'extract_json': True,
+        'transforms': [
+            json.loads,
+            lambda x: list(x['authorRefs'].values())[0],
+            json.dumps,
+        ],
+        'fields': {
+            'username': lambda x: x['alias'],
+            'about': lambda x: x['speciality'],
+            'birthday': lambda x: x['birthday'],
+            'gender': lambda x: x['gender'],
+            'rating': lambda x: x['rating'],
+            'karma': lambda x: x['scoreStats']['score'],
+            'fullname': lambda x: x['fullname'],
+            'is_readonly': lambda x: x['isReadonly'],
+            'location': lambda x: x['location'],
+            'image': lambda x: x['avatarUrl'],
+            'follower_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followersCount'),
+            'following_count': lambda x: x.get('legacy', {}).get('followStats', {}).get('followCount'),
+        }
+    },
     # unactual
     'Twitter HTML': {
         'flags': ['abs.twimg.com', 'moreCSSBundles'],
@@ -886,7 +911,7 @@
     },
     'TikTok': {
         'flags': ['tiktokcdn.com', '__NEXT_DATA__'],
-        'regex': r'',
+        'regex': r'',
         'extract_json': True,
         'transforms': [
             json.loads,
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index cf02b5c..91cc3f4 100755
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -109,10 +109,14 @@ def test_ok():
 
 def test_habr():
     info = extract(parse('https://habr.com/ru/users/m1rko/')[0])
-    assert info.get('uid') == '1371978'
-    assert info.get('username') == 'm1rko'
-    assert info.get('image') == 'http://habrastorage.org/getpro/habr/avatars/4ec/bd0/85d/4ecbd085d692835a931d03174ff19539.png'
-
+    assert info.get("username") == "m1rko"
+    assert info.get("about") == "автор, переводчик, редактор"
+    assert info.get("gender") == "0"
+    assert info.get("rating") == "0"
+    assert info.get("karma") == "1236.5"
+    assert info.get("fullname") == "Анатолий Ализар"
+    assert info.get("is_readonly") == "False"
+    assert info.get("image") == "//habrastorage.org/getpro/habr/avatars/4ec/bd0/85d/4ecbd085d692835a931d03174ff19539.png"
 
 @pytest.mark.github_failed
 def test_habr_no_image():
@@ -380,7 +384,7 @@ def test_behance():
     assert 'appreciations' in info
 
 
-@pytest.mark.github_failed
+@pytest.mark.skip(reason="non-actual, 500px requires POST requests for now")
 def test_500px():
     info = extract(parse('https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22the-maksimov%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19%22%7D%7D')[0])
 
@@ -633,7 +637,7 @@ def test_pinterest_api():
     assert info.get('is_website_verified') == 'False'
     assert info.get('follower_count') == '2'
     assert info.get('group_board_count') == '0'
-    assert info.get('following_count') == '16'
+    assert 'following_count' in info
     assert info.get('board_count') == '11'
     assert int(info.get('pin_count')) > 100
 