Skip to content

Commit

Permalink
feat(connector): add Finna connector
Browse files Browse the repository at this point in the history
Finna is a Finnish open API that contains, among other things, books released in Finland.

The terms of use for this API are located at https://www.kiwi.fi/display/Finna/Finna+API+Terms+of+Use and it is CC0.

more info https://www.finna.fi/Content/about_finnafi
  • Loading branch information
ilkka-ollakka committed Jan 30, 2025
1 parent 03bab92 commit 73519bd
Show file tree
Hide file tree
Showing 7 changed files with 3,268 additions and 0 deletions.
304 changes: 304 additions & 0 deletions bookwyrm/connectors/finna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
"""finna data connector"""

import re
from typing import Iterator

from bookwyrm import models
from bookwyrm.book_search import SearchResult
from bookwyrm.models.book import FormatChoices
from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data
from .connector_manager import ConnectorException, create_edition_task
from .openlibrary_languages import languages


class Connector(AbstractConnector):
"""instantiate a connector for finna"""

# NOTE(review): presumably names the record field used to build generated
# remote links; confirm against AbstractConnector.
generated_remote_link_field = "id"

def __init__(self, identifier: str):
    """Initialise the connector and declare the Finna field mappings.

    Each ``Mapping`` names a target field, the Finna record field it is read
    from (``remote_field``) and, where the raw value needs reshaping, the
    formatter applied to it.
    """
    super().__init__(identifier)

    def first_item(values, *_ignored):
        """Return the first element of ``values``, or None when it is empty."""
        return values[0] if values else None

    self.book_mappings = [
        Mapping("id", remote_field="id"),
        Mapping("title", remote_field="title"),
        Mapping("subtitle", remote_field="subTitle"),
        Mapping("isbn10", remote_field="cleanIsbn"),
        Mapping(
            "languages",
            remote_field="languages",
            formatter=self.resolve_languages,
        ),
        Mapping(
            "authors",
            remote_field="authors",
            formatter=self.parse_authors,
        ),
        Mapping("subjects", formatter=self.join_subject_list),
        Mapping("publishedDate", remote_field="year"),
        Mapping("description", remote_field="summary", formatter=first_item),
        # The series name and number are both read from the same remote field.
        Mapping(
            "series",
            remote_field="series",
            formatter=self.parse_series_name,
        ),
        Mapping(
            "seriesNumber",
            remote_field="series",
            formatter=self.parse_series_number,
        ),
        Mapping("publishers", remote_field="publishers"),
        Mapping(
            "physicalFormat",
            remote_field="formats",
            formatter=self.describe_physical_format,
        ),
        # The format detail and page count are both derived from the
        # free-text physical descriptions.
        Mapping(
            "physicalFormatDetail",
            remote_field="physicalDescriptions",
            formatter=first_item,
        ),
        Mapping(
            "pages",
            remote_field="physicalDescriptions",
            formatter=self.guess_page_numbers,
        ),
    ]

    self.author_mappings = [
        Mapping(
            "id",
            remote_field="authors",
            formatter=self.get_remote_author_id,
        ),
        Mapping(
            "name",
            remote_field="authors",
            formatter=self.get_first_author,
        ),
    ]

def guess_page_numbers(self, data: JsonDict) -> str | None:
    """Guess a page count from free-text physical descriptions.

    Args:
        data: an iterable of description strings (for example
            ``["134 sivua : kuvitettu ; 21 cm"]``). A single string is
            treated as a one-element list; None is treated as empty.

    Returns:
        The page count as a string of digits, or None if no row yields one.
    """
    if isinstance(data, str):
        # Iterating a bare string would inspect it one character at a time.
        data = [data]
    rows = [row for row in (data or []) if isinstance(row, str)]

    # An explicit count in the style of '134 pages' or '134 sivua' is the most
    # reliable signal, so look for one in every row before falling back.
    for row in rows:
        if explicit := re.search(r"(\d+) (sivua|s\.|sidor|pages)", row):
            return explicit.group(1)

    # If nothing matched, settle for a number at the very start of a row.
    for row in rows:
        if leading := re.search(r"^(\d+)", row):
            return leading.group(1)

    return None


def resolve_languages(self, data: JsonDict) -> list[str]:
    """Translate Finna language codes through the imported ``languages`` table.

    Each code is looked up under the key ``/languages/<code>``; a code with
    no entry in the table is passed through unchanged.
    """
    return [languages.get(f"/languages/{code}", code) for code in data]

def join_subject_list(self, data: JsonDict) -> list[str]:
    """Join each subject entry (a sequence of strings) into one string.

    The parts of an entry are separated by a single space.
    """
    separator = " "
    return list(map(separator.join, data))

def describe_physical_format(self, formats: JsonDict) -> str:
    """Map Finna format entries to a BookWyrm physical-format code.

    Args:
        formats: an iterable of format entries, each a dict whose ``"value"``
            holds a Finna format path such as ``"1/Book/eBook/"``. May be
            None or empty.

    Returns:
        ``"Hardcover"``, ``"AudiobookFormat"`` or ``"EBook"``. ``"Hardcover"``
        is the default when nothing matches; when several entries match, the
        last one wins.
    """
    # Map finnish finna formats to bookwyrm codes
    format_mapping = {
        "1/Book/Book/": "Hardcover",
        "1/Book/AudioBook/": "AudiobookFormat",
        "1/Book/eBook/": "EBook",
    }
    physical_format = "Hardcover"
    # A record without a "formats" field hands us None; treat it as empty
    # instead of failing on iteration.
    for entry in formats or []:
        matched = format_mapping.get(entry.get("value"))
        if matched is not None:
            physical_format = matched
    return physical_format

def parse_series_name(self, series: JsonDict) -> str | None:
    """Return the ``"name"`` of the first series entry that has that key.

    Returns None when no entry carries a ``"name"`` key.
    """
    entries_with_name = (entry for entry in series if "name" in entry)
    first_named = next(entries_with_name, None)
    if first_named is None:
        return None
    return first_named.get("name")

def parse_series_number(self, series: JsonDict) -> str | None:
    """Return the ``"additional"`` value of the first series entry that has it.

    That value is what the book mappings use as the series number. Returns
    None when no entry carries an ``"additional"`` key.
    """
    return next(
        (entry.get("additional") for entry in series if "additional" in entry),
        None,
    )

def get_book_data(self, remote_id: str) -> JsonDict:
    """Fetch a single record from Finna.

    Args:
        remote_id: URL of the record; it is requested through ``get_data``
            together with the list of record fields to include.

    Returns:
        The first entry of the response's ``"records"`` list.

    Raises:
        ConnectorException: if the response contains no record.
    """
    request_parameters = {
        "field[]": [
            "authors",
            "cleanIsbn",
            "formats",
            "id",
            "languages",
            "physicalDescriptions",
            "publishers",
            "recordPage",
            "series",
            "shortTitle",
            "subjects",
            "subTitle",
            "summary",
            "title",
            "year",
        ]
    }
    data = get_data(url=remote_id, params=request_parameters)
    # Treat a missing or null "records" value as "no records" so that it is
    # reported as a ConnectorException rather than a TypeError.
    records = data.get("records") or []
    try:
        return records[0]
    except (KeyError, IndexError) as err:
        # Chain the original error so the cause stays visible in tracebacks.
        raise ConnectorException("Invalid book data") from err

def get_remote_author_id(self, data: JsonDict) -> str | None:
    """Build the remote id for the first author found in ``data``.

    The id is ``search_url`` followed by the author name and ``&type=Author``.
    Returns None when no author can be extracted.
    """
    first_author = self.get_first_author(data)
    if not first_author:
        return None
    return f"{self.search_url}{first_author}&type=Author"

def get_remote_id(self, data: JsonDict) -> str:
    """Return the remote id of a record: ``books_url`` followed by its ``"id"``."""
    url_prefix = self.books_url
    return f"{url_prefix}{data.get('id')}"

def get_first_author(self, data: JsonDict) -> str | None:
    """Return the first name produced by ``parse_authors``, or None if there is none."""
    author_names = self.parse_authors(data)
    return author_names[0] if author_names else None

def parse_authors(self, data: JsonDict) -> list[str]:
    """Extract display names of the primary authors from a Finna authors dict.

    Only the ``"primary"`` entry of ``data`` is considered. Authors whose
    role list contains ``"kirjoittaja"`` (Finnish for "writer") are preferred;
    if there are none, authors with the unspecified role ``"-"`` are used.
    Names are converted from 'Lewis, Michael' to 'Michael Lewis'.

    Returns an empty list when there are no primary authors or none of them
    has one of those roles.
    """
    primary_authors = data.get("primary", None)
    if not primary_authors:
        return []

    def names_with_role(wanted_role: str) -> list[str]:
        """Collect the flipped names of the authors carrying ``wanted_role``."""
        return [
            " ".join(reversed(raw_name.split(", ")))
            for raw_name, details in primary_authors.items()
            if wanted_role in details.get("role", [])
        ]

    # Fall back to the unspecified role only when no writer was found.
    return names_with_role("kirjoittaja") or names_with_role("-")

def parse_search_data(
    self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
    """Turn a Finna search response into search results.

    Args:
        data: the decoded response; its ``"records"`` list is iterated in
            order. A missing or null value yields nothing.
        min_confidence: results whose confidence falls below this value are
            not produced.

    Yields:
        One ``SearchResult`` per record. Confidence is purely rank based
        (1, 1/2, 1/3, ...). For formats other than ``"Hardcover"`` the title
        is suffixed with the matching label from ``FormatChoices``.
    """
    for idx, record in enumerate(data.get("records") or []):
        # Confidence only ever decreases with rank, so stop at the first
        # record below the threshold, before doing any further work on it.
        confidence = 1 / (idx + 1)
        if confidence < min_confidence:
            break

        author = None
        authors = record.get("authors")
        if authors:
            author_list = self.parse_authors(authors)
            if author_list:
                author = "; ".join(author_list)

        # Create some extra info on edition if it is audio-book or e-book.
        # Records may lack "formats" altogether, hence the empty-list default.
        physical_format = self.describe_physical_format(record.get("formats") or [])
        edition_info = ""
        if physical_format and physical_format != "Hardcover":
            for format_code, format_label in FormatChoices:
                if format_code == physical_format:
                    edition_info = f" {format_label}"
                    break

        yield SearchResult(
            title=f"{record.get('title')}{edition_info}",
            key=f"{self.books_url}{record.get('id')}",
            author=author,
            year=record.get("year"),
            view_link=f"{self.base_url}{record.get('recordPage')}",
            confidence=confidence,
            connector=self,
        )

def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
    """Yield a ``SearchResult`` for every record of an ISBN search response.

    Records are read from ``data["records"]`` (nothing is yielded when the
    key is absent). Confidence is rank based: 1 for the first record, 1/2
    for the second, and so on. Author names are joined with ``"; "``.
    """
    records = data.get("records", [])
    for position, record in enumerate(records, start=1):
        raw_authors = record.get("authors")
        names = self.parse_authors(raw_authors) if raw_authors else []
        yield SearchResult(
            title=record.get("title"),
            key=f"{self.books_url}{record.get('id')}",
            author="; ".join(names) if names else None,
            year=record.get("year"),
            view_link=f"{self.base_url}{record.get('recordPage')}",
            confidence=1 / position,
            connector=self,
        )

def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
    """Yield an author model for each author parsed from ``data["authors"]``.

    Each author is resolved through ``get_or_create_author`` using a remote
    id built as ``search_url`` + name + ``&type=Author``. Authors for which
    that call returns nothing are skipped.
    """
    raw_authors = data.get("authors")
    if not raw_authors:
        return
    for name in self.parse_authors(raw_authors):
        remote_id = f"{self.search_url}{name}&type=Author"
        author_model = self.get_or_create_author(remote_id)
        if author_model:
            yield author_model

def retrieve_versions(self, book_id: str) -> JsonDict:
    """Query Finna for the versions of a record.

    Sends a ``search=versions`` request for ``book_id`` through ``get_data``,
    asking for the record fields listed below. Example query:
    https://api.finna.fi/v1/search?id=anders.1946700&search=versions&view=&

    Returns the response's ``"records"`` value, or an empty list when the
    response has no such key.
    """
    wanted_fields = [
        "authors",
        "cleanIsbn",
        "edition",
        "formats",
        "id",
        "isbns",
        "languages",
        "physicalDescriptions",
        "publishers",
        "recordPage",
        "series",
        "shortTitle",
        "subjects",
        "subTitle",
        "summary",
        "title",
        "year",
    ]
    query = {
        "id": book_id,
        "search": "versions",
        "view": "",
        "field[]": wanted_fields,
    }
    response = get_data(url="https://api.finna.fi/api/v1/search", params=query)
    return response.get("records", [])

def expand_book_data(self, book: models.Book) -> None:
    """Queue an edition-creation task for every version Finna lists for a book.

    When ``book`` is an edition, its parent work is used instead. If the
    version lookup raises ``ConnectorException`` the method gives up silently.
    """
    # go from the edition to the work, if necessary
    work = book.parent_work if isinstance(book, models.Edition) else book

    # NOTE(review): work.id is what gets sent to Finna as the record id;
    # confirm it holds the remote identifier rather than a local database key.
    try:
        versions = self.retrieve_versions(work.id)
    except ConnectorException:
        # best effort: nothing to expand when the lookup fails
        return

    for version in versions:
        remote_id = self.get_remote_id(version)
        if not remote_id:
            continue
        create_edition_task.delay(self.connector.id, work.id, remote_id)

def get_remote_id_from_model(self, obj: models.BookDataModel) -> str:
    """Build the remote link for a model object: ``books_url`` followed by ``obj.id``."""
    url_prefix = self.books_url
    return f"{url_prefix}{obj.id}"

def is_work_data(self, data: JsonDict) -> bool:
    """Decide whether a record should be treated as the work.

    The record counts as the work when it is the first entry of its own
    version list, or when Finna returns no versions at all. Example query:
    https://api.finna.fi/v1/search?id=anders.1946700&search=versions&view=&lng=fi&field[]=formats&field[]=series&field[]=title&field[]=authors&field[]=summary&field[]=cleanIsbn&field[]=id
    """
    record_id = data.get("id")
    versions = self.retrieve_versions(record_id)
    if not versions:
        return True
    return record_id == versions[0].get("id")

def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
    """Return the first version Finna lists for the record, or ``data`` itself if none."""
    versions = self.retrieve_versions(data.get("id"))
    return versions[0] if versions else data

def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
    """Return the record that stands in for the work of an edition.

    This is the first entry of the version list Finna returns for the
    edition's id. When Finna returns no versions, ``data`` itself is returned
    instead of raising ``IndexError``, which matches how ``is_work_data`` and
    ``get_edition_from_work_data`` treat an empty version list.
    """
    versions = self.retrieve_versions(data.get("id"))
    if not versions:
        return data
    return versions[0]
Loading

0 comments on commit 73519bd

Please sign in to comment.