Skip to content

Commit

Permalink
Merge pull request #39 from soxoj/yandex-updates
Browse files Browse the repository at this point in the history
Yandex updates, tests fixes
  • Loading branch information
soxoj authored Jan 31, 2021
2 parents aa0a025 + 8a39c9a commit 884ebdf
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 7 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## [Unreleased]

## [0.0.5] - 2021-01-31
* updated Yandex services extractors

## [0.0.4] - 2021-01-16
* updated Vimeo and VK

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
long_description = fh.read()

setup(name='socid-extractor',
version='0.0.4',
version='0.0.5',
description='Extract accounts\' identifiers from personal pages on various platforms',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
50 changes: 49 additions & 1 deletion socid_extractor/schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@
'yandex_uid': lambda x: x['owner']['uid'],
'username': lambda x: x['owner']['login'],
'name': lambda x: x['owner']['name'],
'image': lambda x: get_yandex_profile_pic(x['owner']['avatarHash']),
'links': lambda x: [link for links in x['profiles'] for link in links['addresses']],
'is_verified': lambda x: x['verified'],
'liked_albums': lambda x: x['counts']['likedAlbums'],
'liked_artists': lambda x: x['counts']['likedArtists'],
'has_tracks': lambda x: x['hasTracks'],
}
},
'Yandex Znatoki user profile': {
Expand Down Expand Up @@ -111,6 +116,49 @@
'is_business': lambda x: x.get('is_business'),
}
},
'Yandex Reviews user profile': {
'flags': ['isInternalYandexNet', 'ReviewFormContent'],
'regex': r'window.__PRELOADED_DATA = ({[\s\S]+?})\n\s+}catch',
'extract_json': True,
'transforms': [
json.loads,
lambda x: x['pageData']['initialState'],
json.dumps,
],
'fields': {
'yandex_public_id': lambda x: x.get('pkUser', {}).get('publicId'),
'fullname': lambda x: decode_ya_str(x.get('pkUser', {}).get('name')),
'image': lambda x: get_yandex_profile_pic(x.get('pkUser', {}).get('pic')),
'is_verified': lambda x: x.get('pkUser', {}).get('verified'),
'reviews_count': lambda x: len(x.get('reviews', {}).get('all', {}).keys()),
'following_count': lambda x: x.get('subscription', {}).get('subscribersCount'),
'follower_count': lambda x: x.get('subscription', {}).get('subscriptionsCount'),
},
},
'Yandex Zen user profile': {
'flags': ['https://zen.yandex.ru/user/', 'zen-lib'],
'regex': r'\n\s+var data = ({"__[\s\S]+?});\n',
'extract_json': True,
'transforms': [
json.loads,
lambda x: list(filter(lambda y: '__serverState' in y[0], x.items())),
lambda x: x[0][1]['channel']['source'],
json.dumps,
],
'fields': {
'yandex_public_id': lambda x: x.get('publicId'),
'fullname': lambda x: x.get('title'),
'image': lambda x: x.get('logo'),
'bio': lambda x: x.get('description'),
'messenger_guid': lambda x: x.get('messengerGuid'),
'links': lambda x: x.get('socialLinks'),
'type': lambda x: x.get('type'),
'comments_count': lambda x: x.get('userCommentsCount'),
'status': lambda x: x.get('socialProfileStatus'),
'following_count': lambda x: x.get('subscribers'),
'follower_count': lambda x: x.get('subscriptions'),
},
},
'VK user profile': {
'flags': ['Profile.init({', 'change_current_info'],
'regex': r'Profile\.init\({"user_id":(?P<vk_id>\d+).*?(,"loc":"(?P<vk_username>.*?)")?,"back":"(?P<fullname>.*?)"'
Expand Down Expand Up @@ -209,7 +257,7 @@
# https://shadowban.eu/.api/user
# https://gist.github.com/superboum/ab31bc4c85c731b9e89ebda5eaed9a3a
'Twitter Shadowban': {
'flags': ['{"timestamp"', '"profile": {'],
'flags': ['"timestamp"', '"profile": {', 'has_tweets'],
'regex': r'^({.+?})$',
'extract_json': True,
'fields': {
Expand Down
43 changes: 38 additions & 5 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ def test_yandex_disk():
assert info.get('name') == 'Trapl Zdenek'


def test_yandex_reviews():
    """Extract a public Yandex Reviews profile and compare each expected field."""
    profile_url = 'https://reviews.yandex.ru/user/1a7dv00dqrdgjf6qkyn8kw37jw'
    page = parse(profile_url)[0]
    info = extract(page)

    # Every extracted value is compared as a string.
    expected = {
        'yandex_public_id': '1a7dv00dqrdgjf6qkyn8kw37jw',
        'fullname': 'Darya Gindina',
        'image': 'https://avatars.mds.yandex.net/get-yapic/59871/oLXpnRHSVknK56vRAYx2Iuya6U-1/islands-200',
        'is_verified': 'False',
        'reviews_count': '1',
        'following_count': '0',
        'follower_count': '0',
    }
    for field, value in expected.items():
        assert info.get(field) == value


@pytest.mark.skip(reason="failed from github CI infra IPs")
def test_instagram():
info = extract(parse('https://www.instagram.com/alexaimephotography/')[0])
Expand Down Expand Up @@ -173,7 +185,7 @@ def test_github_api():
info = extract(parse('https://api.github.com/users/soxoj')[0])

assert info.get('uid') == '31013580'
assert info.get('image') == 'https://avatars2.githubusercontent.com/u/31013580?v=4'
assert info.get('image') == 'https://avatars.githubusercontent.com/u/31013580?v=4'
assert info.get('created_at') == '2017-08-14T17:03:07Z'
assert 'follower_count' in info
assert 'following_count' in info
Expand Down Expand Up @@ -216,13 +228,34 @@ def test_my_mail_communities():
assert info.get('isVideoChannel') == 'False'


def test_yandex_music_user_profile():
    """Check the fields extracted from the Yandex Music library handler for 'pritisk'."""
    # The handler is requested once, with a referer header. The former skip
    # reason recorded that the header-less request produced an empty result,
    # so that request (and the skip marker) are dropped.
    headers = {'referer': 'https://music.yandex.ru/users/pritisk/playlists'}
    info = extract(parse('https://music.yandex.ru/handlers/library.jsx?owner=pritisk', headers=headers)[0])

    assert info.get('yandex_uid') == '16480689'
    assert info.get('username') == 'pritisk'
    assert info.get('name') == 'Юрий Притиск'
    assert info.get('image') == 'https://avatars.mds.yandex.net/get-yapic/29310/gK74BTyv8LrLRT0mQFIR2xcWv8-1/islands-200'
    assert info.get('links') == '[]'
    assert info.get('is_verified') == 'False'
    assert info.get('liked_albums') == '0'
    assert info.get('liked_artists') == '0'


@pytest.mark.skip(reason="failed from github CI infra IPs")
def test_yandex_zen_user_profile():
    """Extract a Yandex Zen user page and verify the identifying fields."""
    page = parse('https://zen.yandex.ru/user/uyawkukxyf60ud6hjrxr2rq130')[0]
    info = extract(page)

    # Fields with an exact expected value.
    exact_values = {
        'yandex_public_id': 'uyawkukxyf60ud6hjrxr2rq130',
        'fullname': 'Нина Кравченко',
        'image': 'https://avatars.mds.yandex.net/get-yapic/51169/DKXVQdtL3tZ5cayBXnnicLaKcE-1/islands-200',
        'messenger_guid': 'e4615300-548b-9a46-73cf-527d47fe57ed',
        'links': '[]',
        'type': 'user',
        'status': 'active',
    }
    for field, value in exact_values.items():
        assert info.get(field) == value

    # The comment counter is only given a lower bound.
    assert int(info.get('comments_count')) > 20

    # The counters only have to be present.
    for counter in ('following_count', 'follower_count'):
        assert counter in info


def test_yandex_znatoki_user_profile():
Expand All @@ -243,7 +276,7 @@ def test_behance():


def test_500px():
info = extract(parse('https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22the-maksimov%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%225a17a9af1830b58b94a912995b7947b24f27f1301c6ea8ab71a9eb1a6a86585b%22%7D%7D')[0])
info = extract(parse('https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22the-maksimov%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19%22%7D%7D')[0])

assert info.get('uid') == 'dXJpOm5vZGU6VXNlcjoyMzg5Ng=='
assert info.get('legacy_id') == '23896'
Expand Down Expand Up @@ -379,7 +412,7 @@ def test_youtube():
def test_google_maps():
    """Check the contributor level and name extracted from a Google Maps profile."""
    info = extract(parse('https://www.google.com/maps/contrib/117503292148966883754')[0])

    # Only the level prefix is checked: the trailing points total changes as
    # the contributor earns points, so an exact match would be brittle.
    assert info.get('contribution_level').startswith('Level 3 Local Guide')
    assert info.get('name') == 'Art NI'


Expand Down

0 comments on commit 884ebdf

Please sign in to comment.