-
-
Notifications
You must be signed in to change notification settings - Fork 267
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(connector): add Finna connector
Finna is a Finnish open API that contains, among other things, books released in Finland. The terms of use for this API are located at https://www.kiwi.fi/display/Finna/Finna+API+Terms+of+Use and it is CC0. More info: https://www.finna.fi/Content/about_finnafi
- Loading branch information
1 parent
03bab92
commit 73519bd
Showing
7 changed files
with
3,268 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,304 @@ | ||
"""finna data connector""" | ||
|
||
import re | ||
from typing import Iterator | ||
|
||
from bookwyrm import models | ||
from bookwyrm.book_search import SearchResult | ||
from bookwyrm.models.book import FormatChoices | ||
from .abstract_connector import AbstractConnector, Mapping, JsonDict | ||
from .abstract_connector import get_data | ||
from .connector_manager import ConnectorException, create_edition_task | ||
from .openlibrary_languages import languages | ||
|
||
|
||
class Connector(AbstractConnector): | ||
"""instantiate a connector for finna""" | ||
|
||
generated_remote_link_field = "id" | ||
|
||
def __init__(self, identifier: str):
    """Set up the mappings that translate Finna records into local fields.

    ``identifier`` is handed unchanged to the parent connector initialiser.
    """
    super().__init__(identifier)

    def first_item(values, *_ignored):
        """Return the first element of a non-empty sequence, otherwise None."""
        return values[0] if values else None

    # Every Mapping names the local field, the key read from the Finna
    # record and, optionally, a formatter applied to the raw value.
    book_mappings = [
        Mapping("id", remote_field="id"),
        Mapping("title", remote_field="title"),
        Mapping("subtitle", remote_field="subTitle"),
        Mapping("isbn10", remote_field="cleanIsbn"),
        Mapping(
            "languages",
            remote_field="languages",
            formatter=self.resolve_languages,
        ),
        Mapping(
            "authors",
            remote_field="authors",
            formatter=self.parse_authors,
        ),
        Mapping("subjects", formatter=self.join_subject_list),
        Mapping("publishedDate", remote_field="year"),
        Mapping("description", remote_field="summary", formatter=first_item),
        # The series name and number are both read from the "series" field.
        Mapping(
            "series",
            remote_field="series",
            formatter=self.parse_series_name,
        ),
        Mapping(
            "seriesNumber",
            remote_field="series",
            formatter=self.parse_series_number,
        ),
        Mapping("publishers", remote_field="publishers"),
        Mapping(
            "physicalFormat",
            remote_field="formats",
            formatter=self.describe_physical_format,
        ),
        # The format detail and page count are both read from
        # "physicalDescriptions".
        Mapping(
            "physicalFormatDetail",
            remote_field="physicalDescriptions",
            formatter=first_item,
        ),
        Mapping(
            "pages",
            remote_field="physicalDescriptions",
            formatter=self.guess_page_numbers,
        ),
    ]
    self.book_mappings = book_mappings

    # Author id and name are both derived from the record's "authors" field.
    author_mappings = [
        Mapping(
            "id",
            remote_field="authors",
            formatter=self.get_remote_author_id,
        ),
        Mapping(
            "name",
            remote_field="authors",
            formatter=self.get_first_author,
        ),
    ]
    self.author_mappings = author_mappings
|
||
def guess_page_numbers(self, data: JsonDict) -> str | None:
    """Guess the page count from free-text physical descriptions.

    Args:
        data: The record's ``physicalDescriptions`` value, iterated as a
            sequence of description strings. May be ``None`` or empty.

    Returns:
        The digits of the first page count found, as a string, or ``None``
        when no row yields a number.
    """
    # Explicit page count such as '134 pages', '134 sivua', '134 s.' or
    # '134 sidor'.
    labelled_count = re.compile(r"(\d+) (sivua|s\.|sidor|pages)")
    # Fallback: a description that simply starts with a number.
    leading_number = re.compile(r"^(\d+)")

    for row in data or []:
        if not isinstance(row, str):
            # Skip malformed rows instead of letting ``re`` raise TypeError.
            continue
        # Within one row the labelled form takes precedence over the
        # leading-number fallback.
        match = labelled_count.search(row) or leading_number.search(row)
        if match:
            return match.group(1)
    return None
|
||
|
||
def resolve_languages(self, data: JsonDict) -> list[str]:
    """Translate language codes into the values of the ``languages`` table.

    Each code is looked up under the key ``/languages/<code>``; a code
    without an entry is passed through unchanged.
    """
    return [languages.get(f"/languages/{code}", code) for code in data]
|
||
def join_subject_list(self, data: JsonDict) -> list[str]:
    """Collapse every subject entry into one space-separated string."""
    return list(map(" ".join, data))
|
||
def describe_physical_format(self, formats: JsonDict) -> str:
    """Map Finna format entries to a BookWyrm physical format code.

    Args:
        formats: Iterable of format entries, each expected to be a dict
            holding its Finna format code under the ``value`` key. May be
            ``None`` or empty when a record carries no format information.

    Returns:
        ``"AudiobookFormat"`` or ``"EBook"`` when a matching Finna code is
        present, otherwise ``"Hardcover"``. If several entries match, the
        last one wins.
    """
    # Map Finna format codes to BookWyrm codes
    format_mapping = {
        "1/Book/Book/": "Hardcover",
        "1/Book/AudioBook/": "AudiobookFormat",
        "1/Book/eBook/": "EBook",
    }
    physical_format = "Hardcover"
    for entry in formats or []:
        if not isinstance(entry, dict):
            # Ignore malformed entries rather than failing on ``.get``.
            continue
        mapped = format_mapping.get(entry.get("value"))
        if mapped is not None:
            physical_format = mapped
    return physical_format
|
||
def parse_series_name(self, series: JsonDict) -> str | None:
    """Return the ``name`` of the first series entry that has one, else None."""
    names = (entry.get("name") for entry in series if "name" in entry)
    return next(names, None)
|
||
def parse_series_number(self, series: JsonDict) -> str | None:
    """Return ``additional`` of the first series entry that has it, else None."""
    numbers = (
        entry.get("additional") for entry in series if "additional" in entry
    )
    return next(numbers, None)
|
||
def get_book_data(self, remote_id: str) -> JsonDict:
    """Fetch a single record and return its data.

    Args:
        remote_id: URL of the record. It is requested together with an
            explicit list of the fields to include in the response.

    Returns:
        The first entry of the response's ``records`` list.

    Raises:
        ConnectorException: If the response holds no usable record.
    """
    request_parameters = {
        "field[]": [
            "authors",
            "cleanIsbn",
            "formats",
            "id",
            "languages",
            "physicalDescriptions",
            "publishers",
            "recordPage",
            "series",
            "shortTitle",
            "subjects",
            "subTitle",
            "summary",
            "title",
            "year",
        ]
    }
    response = get_data(url=remote_id, params=request_parameters)
    # A missing or null ``records`` value is treated like an empty result.
    records = response.get("records") or []
    try:
        return records[0]
    except (KeyError, IndexError, TypeError) as err:
        # Keep the original failure attached as the cause for debugging.
        raise ConnectorException("Invalid book data") from err
|
||
def get_remote_author_id(self, data: JsonDict) -> str | None:
    """Build the remote id for the first author found in ``data``.

    The id is the connector's search URL followed by the author name and
    ``&type=Author``. Returns None when no author can be determined.
    """
    author = self.get_first_author(data)
    if not author:
        return None
    return f"{self.search_url}{author}&type=Author"
|
||
def get_remote_id(self, data: JsonDict) -> str:
    """Return the connector's books URL followed by the record's ``id``."""
    record_id = data.get("id")
    return f"{self.books_url}{record_id}"
|
||
def get_first_author(self, data: JsonDict) -> str | None:
    """Return the first name produced by ``parse_authors``, or None if empty."""
    authors = self.parse_authors(data)
    return authors[0] if authors else None
|
||
def parse_authors(self, data: JsonDict) -> list[str]:
    """Extract display names of the primary authors from author data.

    Args:
        data: The record's ``authors`` value. Its ``primary`` entry is
            expected to map names in ``Last, First`` form to a dict that
            may hold a ``role`` list.

    Returns:
        Names reordered to ``First Last``. Authors carrying the role
        ``kirjoittaja`` are preferred; when there are none, authors whose
        role is the unspecified marker ``-`` are returned instead. The
        list is empty when neither kind exists or the data is malformed.
    """
    primary_authors = data.get("primary") if isinstance(data, dict) else None
    if not primary_authors or not isinstance(primary_authors, dict):
        return []

    def names_with_role(role: str) -> list[str]:
        """Collect display names of the primary authors that carry ``role``."""
        return [
            # Convert from 'Lewis, Michael' to 'Michael Lewis'
            " ".join(reversed(name.split(", ")))
            for name, info in primary_authors.items()
            # A missing or null role list simply never matches.
            if isinstance(info, dict) and role in (info.get("role") or [])
        ]

    # Search for the 'kirjoittaja' role first; if nothing is found, accept
    # any author whose role is not specified (marked with '-').
    return names_with_role("kirjoittaja") or names_with_role("-")
|
||
def parse_search_data(
    self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
    """Turn a search response into search results.

    Args:
        data: Search response; results are read from its ``records`` list.
        min_confidence: Lower bound for a result's confidence. Confidence
            is ``1 / rank`` with rank starting at 1, so iteration stops at
            the first record that falls below the bound.

    Yields:
        One ``SearchResult`` per accepted record. For editions that are
        not ``"Hardcover"``, the matching label from ``FormatChoices`` is
        appended to the title.
    """
    for rank, record in enumerate(data.get("records") or [], start=1):
        # Check the cut-off first so no work is spent on rejected records.
        confidence = 1 / rank
        if confidence < min_confidence:
            break

        author = None
        authors = record.get("authors")
        if authors:
            author_list = self.parse_authors(authors)
            if author_list:
                author = "; ".join(author_list)

        # Create some extra info on edition if it is audio-book or e-book.
        # Records without a "formats" field are described from an empty list.
        format_code = self.describe_physical_format(record.get("formats") or [])
        edition_info = ""
        if format_code and format_code != "Hardcover":
            for choice_code, choice_label in FormatChoices:
                if choice_code == format_code:
                    edition_info = f" {choice_label}"
                    break

        yield SearchResult(
            title=f"{record.get('title')}{edition_info}",
            key=f"{self.books_url}{record.get('id')}",
            author=author,
            year=record.get("year"),
            view_link=f"{self.base_url}{record.get('recordPage')}",
            confidence=confidence,
            connector=self,
        )
|
||
def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
    """Yield a search result for each record of an ISBN search response.

    Confidence decreases with position: the first record gets 1, the
    second 1/2, and so on. Author names are joined with "; "; the author
    is None when the record has no parsable authors.
    """
    records = data.get("records", [])
    for position, record in enumerate(records):
        raw_authors = record.get("authors")
        author_names = self.parse_authors(raw_authors) if raw_authors else None
        yield SearchResult(
            title=record.get("title"),
            key=f"{self.books_url}{record.get('id')}",
            author="; ".join(author_names) if author_names else None,
            year=record.get("year"),
            view_link=f"{self.base_url}{record.get('recordPage')}",
            confidence=1 / (position + 1),
            connector=self,
        )
|
||
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
    """Yield an author model for every parsed author of a record.

    Each author is fetched through ``get_or_create_author`` using a remote
    id made of the connector's search URL, the author's name and
    ``&type=Author``. Authors for which nothing is returned are skipped.
    """
    raw_authors = data.get("authors")
    if not raw_authors:
        return
    for name in self.parse_authors(raw_authors):
        author_model = self.get_or_create_author(
            f"{self.search_url}{name}&type=Author"
        )
        if author_model:
            yield author_model
|
||
def retrieve_versions(self, book_id: str) -> list[JsonDict]:
    """Fetch all versions of a record from the Finna search API.

    The request goes to ``https://api.finna.fi/api/v1/search`` with
    ``id=<book_id>``, ``search=versions`` and an empty ``view``, plus the
    list of fields to include for every version.

    Args:
        book_id: Record id sent as the ``id`` parameter,
            e.g. ``anders.1946700``.

    Returns:
        The response's ``records`` list, or an empty list when the
        response has none.
    """
    request_parameters = {
        "id": book_id,
        "search": "versions",
        "view": "",
        "field[]": [
            "authors",
            "cleanIsbn",
            "edition",
            "formats",
            "id",
            "isbns",
            "languages",
            "physicalDescriptions",
            "publishers",
            "recordPage",
            "series",
            "shortTitle",
            "subjects",
            "subTitle",
            "summary",
            "title",
            "year",
        ],
    }
    response = get_data(
        url="https://api.finna.fi/api/v1/search", params=request_parameters
    )
    # Normalise a missing or null ``records`` value to an empty list so
    # callers can always iterate or test the result for emptiness.
    return response.get("records") or []
|
||
def expand_book_data(self, book: models.Book) -> None:
    """Queue creation of the editions listed as versions of a book.

    When ``book`` is an edition, its parent work is used. Every version
    returned by ``retrieve_versions`` that has an ``id`` is handed to
    ``create_edition_task``. If the versions cannot be fetched, nothing
    is queued.
    """
    work = book
    # go from the edition to the work, if necessary
    if isinstance(book, models.Edition):
        work = book.parent_work

    try:
        # NOTE(review): ``work.id`` looks like the local model id, while
        # ``retrieve_versions`` sends its argument to Finna as a record id.
        # Confirm that this is the intended identifier.
        edition_options = self.retrieve_versions(work.id)
    except ConnectorException:
        # Best effort: without a version list there is nothing to expand.
        return

    for edition in edition_options:
        # The remote id is a formatted URL and therefore always truthy, so
        # test the record id itself to skip entries that have none.
        if not edition.get("id"):
            continue
        remote_id = self.get_remote_id(edition)
        create_edition_task.delay(self.connector.id, work.id, remote_id)
|
||
def get_remote_id_from_model(self, obj: models.BookDataModel) -> str:
    """Build a remote link for a model: the books URL followed by ``obj.id``."""
    # NOTE(review): presumably ``obj.id`` is the local model id rather than
    # the Finna record id. Confirm that this yields a valid link.
    model_id = obj.id
    return f"{self.books_url}{model_id}"
|
||
def is_work_data(self, data: JsonDict) -> bool:
    """Decide whether a record should be treated as work data.

    The record's versions are fetched with ``retrieve_versions``. The
    record counts as the work when its ``id`` equals that of the first
    listed version, or when no versions are returned at all.
    """
    versions = self.retrieve_versions(data.get("id"))
    if not versions:
        return True
    first_version = versions[0]
    return data.get("id") == first_version.get("id")
|
||
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
    """Return the first version of the record, or the record itself if none."""
    versions = self.retrieve_versions(data.get("id"))
    return versions[0] if versions else data
|
||
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
    """Return the record used as the work for an edition record.

    The first entry returned by ``retrieve_versions`` for the record's
    ``id`` is used. When no versions are returned, the record itself is
    returned instead of raising ``IndexError``, matching the fallback in
    ``get_edition_from_work_data``.
    """
    versions = self.retrieve_versions(data.get("id"))
    return versions[0] if versions else data
Oops, something went wrong.