diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
index d059f060ec9..c60a764ea47 100644
--- a/openlibrary/catalog/add_book/__init__.py
+++ b/openlibrary/catalog/add_book/__init__.py
@@ -24,6 +24,7 @@
 """
 
 import itertools
+import json
 import re
 from typing import TYPE_CHECKING, Any, Final
@@ -40,6 +41,7 @@ from openlibrary import accounts
 from openlibrary.catalog.utils import (
     EARLIEST_PUBLISH_YEAR_FOR_BOOKSELLERS,
+    get_non_isbn_asin,
     get_publication_year,
     is_independently_published,
     is_promise_item,
@@ -487,13 +489,18 @@ def find_quick_match(rec):
     if ekeys:
         return ekeys[0]
 
-    # only searches for the first value from these lists
+    # Look for a matching non-ISBN ASIN identifier (e.g. from a BWB promise item).
+    if (non_isbn_asin := get_non_isbn_asin(rec)) and (
+        ekeys := editions_matched(rec, "identifiers.amazon", non_isbn_asin)
+    ):
+        return ekeys[0]
+
+    # Only searches for the first value from these lists
     for f in 'source_records', 'oclc_numbers', 'lccn':
         if rec.get(f):
            if f == 'source_records' and not rec[f][0].startswith('ia:'):
                 continue
-            ekeys = editions_matched(rec, f, rec[f][0])
-            if ekeys:
+            if ekeys := editions_matched(rec, f, rec[f][0]):
                 return ekeys[0]
     return False
@@ -535,6 +542,7 @@ def find_exact_match(rec, edition_pool):
                 continue
             seen.add(ekey)
             existing = web.ctx.site.get(ekey)
+
             match = True
             for k, v in rec.items():
                 if k == 'source_records':
@@ -974,7 +982,33 @@ def should_overwrite_promise_item(
     return bool(safeget(lambda: edition['source_records'][0], '').startswith("promise"))
 
 
-def load(rec, account_key=None, from_marc_record: bool = False):
+def supplement_rec_with_import_item_metadata(
+    rec: dict[str, Any], identifier: str
+) -> None:
+    """
+    Queries for a staged/pending row in `import_item` by identifier, and if found,
+    uses select metadata to supplement empty or '????' fields in `rec`.
+
+    Changes `rec` in place.
+    """
+    from openlibrary.core.imports import ImportItem  # Evade circular import.
+
+    import_fields = [
+        'authors',
+        'publish_date',
+        'publishers',
+        'number_of_pages',
+        'physical_format',
+    ]
+
+    if import_item := ImportItem.find_staged_or_pending([identifier]).first():
+        import_item_metadata = json.loads(import_item.get("data", '{}'))
+        for field in import_fields:
+            if not rec.get(field) and (staged_field := import_item_metadata.get(field)):
+                rec[field] = staged_field
+
+
+def load(rec: dict, account_key=None, from_marc_record: bool = False):
     """Given a record, tries to add/match that edition in the system.
 
     Record is a dictionary containing all the metadata of the edition.
@@ -993,8 +1027,11 @@ def load(rec, account_key=None, from_marc_record: bool = False):
 
     normalize_import_record(rec)
 
-    # Resolve an edition if possible, or create and return one if not.
+    # For recs with a non-ISBN ASIN, supplement the record with BookWorm metadata.
+    if non_isbn_asin := get_non_isbn_asin(rec):
+        supplement_rec_with_import_item_metadata(rec=rec, identifier=non_isbn_asin)
+
+    # Resolve an edition if possible, or create and return one if not.
     edition_pool = build_pool(rec)
     if not edition_pool:
         # No match candidates found, add edition
diff --git a/openlibrary/catalog/utils/__init__.py b/openlibrary/catalog/utils/__init__.py
index 1f25c5e12f2..fbe758804f4 100644
--- a/openlibrary/catalog/utils/__init__.py
+++ b/openlibrary/catalog/utils/__init__.py
@@ -340,6 +340,15 @@ def needs_isbn_and_lacks_one(rec: dict) -> bool:
     """
 
     def needs_isbn(rec: dict) -> bool:
+        # Exception for Amazon-specific ASINs, which often accompany ebooks
+        if any(
+            name == "amazon" and identifier.startswith("B")
+            for record in rec.get("source_records", [])
+            if record and ":" in record
+            for name, identifier in [record.split(":", 1)]
+        ):
+            return False
+
         return any(
             record.split(":")[0] in BOOKSELLERS_WITH_ADDITIONAL_VALIDATION
             for record in rec.get('source_records', [])
@@ -359,6 +368,52 @@ def is_promise_item(rec: dict) -> bool:
     )
 
 
+def get_non_isbn_asin(rec: dict) -> str | None:
+    """
+    Return a non-ISBN ASIN (e.g. B012345678) if one exists.
+
+    There is a tacit assumption that at most one will exist.
+    """
+    # Look first in identifiers.
+    amz_identifiers = rec.get("identifiers", {}).get("amazon", [])
+    if asin := next(
+        (identifier for identifier in amz_identifiers if identifier.startswith("B")),
+        None,
+    ):
+        return asin
+
+    # Finally, check source_records.
+    if asin := next(
+        (
+            record.split(":")[-1]
+            for record in rec.get("source_records", [])
+            if record.startswith("amazon:B")
+        ),
+        None,
+    ):
+        return asin
+
+    return None
+
+
+def is_asin_only(rec: dict) -> bool:
+    """Returns True if the rec has only an ASIN and no ISBN, and False otherwise."""
+    # Immediately return False if any ISBNs are present
+    if any(isbn_type in rec for isbn_type in ("isbn_10", "isbn_13")):
+        return False
+
+    # Check for Amazon source records starting with "B".
+    if any(record.startswith("amazon:B") for record in rec.get("source_records", [])):
+        return True
+
+    # Check for Amazon identifiers starting with "B".
+    amz_identifiers = rec.get("identifiers", {}).get("amazon", [])
+    if any(identifier.startswith("B") for identifier in amz_identifiers):
+        return True
+
+    return False
+
+
 def get_missing_fields(rec: dict) -> list[str]:
     """Return missing fields, if any."""
     required_fields = [
diff --git a/openlibrary/core/models.py b/openlibrary/core/models.py
index b109ff4a629..285be446224 100644
--- a/openlibrary/core/models.py
+++ b/openlibrary/core/models.py
@@ -373,10 +373,35 @@ def get_ia_download_link(self, suffix):
         if filename:
             return f"https://archive.org/download/{self.ocaid}/{filename}"
 
+    @staticmethod
+    def get_isbn_or_asin(isbn_or_asin: str) -> tuple[str, str]:
+        """
+        Return a tuple with an ISBN or an ASIN, accompanied by an empty string.
+        If the identifier is an ISBN, it appears in index 0.
+        If the identifier is an ASIN, it appears in index 1.
+ """ + isbn = canonical(isbn_or_asin) + asin = isbn_or_asin.upper() if isbn_or_asin.upper().startswith("B") else "" + return (isbn, asin) + + @staticmethod + def is_valid_identifier(isbn: str, asin: str) -> bool: + """Return `True` if there is a valid identifier.""" + return len(isbn) in [10, 13] or len(asin) == 10 + + @staticmethod + def get_identifier_forms(isbn: str, asin: str) -> list[str]: + """Make a list of ISBN 10, ISBN 13, and ASIN, insofar as each is available.""" + isbn_13 = to_isbn_13(isbn) + isbn_10 = isbn_13_to_isbn_10(isbn_13) if isbn_13 else None + return [id_ for id_ in [isbn_10, isbn_13, asin] if id_] + @classmethod - def from_isbn(cls, isbn: str, high_priority: bool = False) -> "Edition | None": + def from_isbn( + cls, isbn_or_asin: str, high_priority: bool = False + ) -> "Edition | None": """ - Attempts to fetch an edition by ISBN, or if no edition is found, then + Attempts to fetch an edition by ISBN or ASIN, or if no edition is found, then check the import_item table for a match, then as a last result, attempt to import from Amazon. :param bool high_priority: If `True`, (1) any AMZ import requests will block @@ -386,28 +411,27 @@ def from_isbn(cls, isbn: str, high_priority: bool = False) -> "Edition | None": server will return a promise. :return: an open library edition for this ISBN or None. """ - isbn = canonical(isbn) - - if len(isbn) not in [10, 13]: - return None # consider raising ValueError + # Determine if we've got an ISBN or ASIN and if it's facially valid. + isbn, asin = cls.get_isbn_or_asin(isbn_or_asin) + if not cls.is_valid_identifier(isbn=isbn, asin=asin): + return None - isbn13 = to_isbn_13(isbn) - if isbn13 is None: - return None # consider raising ValueError - isbn10 = isbn_13_to_isbn_10(isbn13) - isbns = [isbn13, isbn10] if isbn10 is not None else [isbn13] + # Create a list of ISBNs (or an ASIN) to match. + if not (book_ids := cls.get_identifier_forms(isbn=isbn, asin=asin)): + return None # Attempt to fetch book from OL - for isbn in isbns: - if isbn: - matches = web.ctx.site.things( - {"type": "/type/edition", 'isbn_%s' % len(isbn): isbn} - ) - if matches: - return web.ctx.site.get(matches[0]) + for book_id in book_ids: + if book_id == asin: + query = {"type": "/type/edition", 'identifiers': {'amazon': asin}} + else: + query = {"type": "/type/edition", 'isbn_%s' % len(book_id): book_id} + + if matches := web.ctx.site.things(query): + return web.ctx.site.get(matches[0]) # Attempt to fetch the book from the import_item table - if edition := ImportItem.import_first_staged(identifiers=isbns): + if edition := ImportItem.import_first_staged(identifiers=book_ids): return edition # Finally, try to fetch the book data from Amazon + import. @@ -415,14 +439,14 @@ def from_isbn(cls, isbn: str, high_priority: bool = False) -> "Edition | None": # uses, will block + wait until the Product API responds and the result, if any, # is staged in `import_item`. 
         try:
-            get_amazon_metadata(
-                id_=isbn10 or isbn13, id_type="isbn", high_priority=high_priority
-            )
-            return ImportItem.import_first_staged(identifiers=isbns)
+            id_ = asin or book_ids[0]
+            id_type = "asin" if asin else "isbn"
+            get_amazon_metadata(id_=id_, id_type=id_type, high_priority=high_priority)
+            return ImportItem.import_first_staged(identifiers=book_ids)
         except requests.exceptions.ConnectionError:
             logger.exception("Affiliate Server unreachable")
         except requests.exceptions.HTTPError:
-            logger.exception(f"Affiliate Server: id {isbn10 or isbn13} not found")
+            logger.exception(f"Affiliate Server: id {id_} not found")
         return None
 
     def is_ia_scan(self):
diff --git a/openlibrary/core/vendors.py b/openlibrary/core/vendors.py
index 7d556475b19..743900108fb 100644
--- a/openlibrary/core/vendors.py
+++ b/openlibrary/core/vendors.py
@@ -242,13 +242,16 @@ def serialize(product: Any) -> dict:
         logger.exception(f"serialize({product})")
         publish_date = None
 
+    asin_is_isbn10 = not product.asin.startswith("B")
+    isbn_13 = isbn_10_to_isbn_13(product.asin) if asin_is_isbn10 else None
+
     book = {
         'url': "https://www.amazon.com/dp/{}/?tag={}".format(
             product.asin, h.affiliate_id('amazon')
         ),
         'source_records': ['amazon:%s' % product.asin],
-        'isbn_10': [product.asin],
-        'isbn_13': [isbn_10_to_isbn_13(product.asin)],
+        'isbn_10': [product.asin] if asin_is_isbn10 else [],
+        'isbn_13': [isbn_13] if isbn_13 else [],
         'price': price and price.display_amount,
         'price_amt': price and price.amount and int(100 * price.amount),
         'title': (
@@ -297,6 +300,7 @@ def get_amazon_metadata(
     id_type: Literal['asin', 'isbn'] = 'isbn',
     resources: Any = None,
     high_priority: bool = False,
+    stage_import: bool = True,
 ) -> dict | None:
     """Main interface to Amazon LookupItem API. Will cache results.
 
     :param str id_: The item id: isbn (10/13), or Amazon ASIN.
     :param str id_type: 'isbn' or 'asin'.
     :param bool high_priority: Priority in the import queue. High priority goes to
           the front of the queue.
+    :param bool stage_import: stage the id_ for import if not in the cache.
     :return: A single book item's metadata, or None.
     """
     return cached_get_amazon_metadata(
         id_,
         id_type=id_type,
         resources=resources,
         high_priority=high_priority,
+        stage_import=stage_import,
     )
@@ -329,6 +335,7 @@ def _get_amazon_metadata(
     id_type: Literal['asin', 'isbn'] = 'isbn',
     resources: Any = None,
     high_priority: bool = False,
+    stage_import: bool = True,
 ) -> dict | None:
     """Uses the Amazon Product Advertising API ItemLookup operation to locate a
     specific book by identifier; either 'isbn' or 'asin'.
@@ -340,6 +347,7 @@ def _get_amazon_metadata(
     See https://webservices.amazon.com/paapi5/documentation/get-items.html
     :param bool high_priority: Priority in the import queue. High priority goes to
           the front of the queue.
+    :param bool stage_import: stage the id_ for import if not in the cache.
     :return: A single book item's metadata, or None.
""" if not affiliate_server_url: @@ -358,8 +366,9 @@ def _get_amazon_metadata( try: priority = "true" if high_priority else "false" + stage = "true" if stage_import else "false" r = requests.get( - f'http://{affiliate_server_url}/isbn/{id_}?high_priority={priority}' + f'http://{affiliate_server_url}/isbn/{id_}?high_priority={priority}&stage_import={stage}' ) r.raise_for_status() if data := r.json().get('hit'): diff --git a/openlibrary/plugins/books/dynlinks.py b/openlibrary/plugins/books/dynlinks.py index 92c7d76b884..f9d38fc3b8c 100644 --- a/openlibrary/plugins/books/dynlinks.py +++ b/openlibrary/plugins/books/dynlinks.py @@ -1,10 +1,13 @@ from typing import Any +import importlib import json import sys -from collections.abc import Generator, Hashable, Iterable, Mapping +from collections.abc import Hashable, Iterable, Mapping import web + from openlibrary.core.models import Edition +from openlibrary.core.imports import ImportItem from openlibrary.plugins.openlibrary.processors import urlsafe from openlibrary.core import helpers as h @@ -13,6 +16,11 @@ from infogami.utils.delegate import register_exception +# Import from manage-imports, but get around hyphen problem. +imports_module = "scripts.manage-imports" +manage_imports = importlib.import_module(imports_module) + + def split_key(bib_key: str) -> tuple[str | None, str | None]: """ >>> split_key('1234567890') @@ -454,34 +462,42 @@ def is_isbn(bib_key: str) -> bool: return len(bib_key) in {10, 13} -def get_missed_isbn_bib_keys( - bib_keys: Iterable[str], found_records: dict -) -> Generator[str, None, None]: +def get_missed_isbn_bib_keys(bib_keys: Iterable[str], found_records: dict) -> list[str]: """ Return a Generator[str, None, None] with all ISBN bib_keys not in `found_records`. """ - return ( + return [ bib_key for bib_key in bib_keys if bib_key not in found_records and is_isbn(bib_key) - ) + ] def get_isbn_editiondict_map( - isbns: Iterable, high_priority: bool = False + isbns: Iterable[str], high_priority: bool = False ) -> dict[str, Any]: """ Attempts to import items from their ISBN, returning a mapping of possibly imported edition_dicts in the following form: {isbn_string: edition_dict...}} """ + # Supplement non-ISBN ASIN records with BookWorm metadata for that ASIN. + if high_priority: + for isbn in isbns: + if not isbn.upper().startswith("B"): + continue + + if item_to_import := ImportItem.find_staged_or_pending([isbn]).first(): + item_edition = ImportItem(item_to_import) + manage_imports.do_import(item_edition, require_marc=False) + # Get a mapping of ISBNs to new Editions (or `None`) isbn_edition_map = { - isbn: Edition.from_isbn(isbn=isbn, high_priority=high_priority) + isbn: Edition.from_isbn(isbn_or_asin=isbn, high_priority=high_priority) for isbn in isbns } - # Convert edictions to dicts, dropping ISBNs for which no edition was created. + # Convert editions to dicts, dropping ISBNs for which no edition was created. return { isbn: edition.dict() for isbn, edition in isbn_edition_map.items() if edition } @@ -505,15 +521,17 @@ def dynlinks(bib_keys: Iterable[str], options: web.storage) -> str: # for backward-compatibility if options.get("details", "").lower() == "true": options["jscmd"] = "details" + high_priority = options.get("high_priority", False) try: edition_dicts = query_docs(bib_keys) + # For any ISBN bib_keys without hits, attempt to import+use immediately if # `high_priority`. Otherwise, queue them for lookup via the AMZ Products # API and process whatever editions were found in existing data. 
         if missed_isbns := get_missed_isbn_bib_keys(bib_keys, edition_dicts):
             new_editions = get_isbn_editiondict_map(
-                isbns=missed_isbns, high_priority=options.get("high_priority")
+                isbns=missed_isbns, high_priority=high_priority
             )
             edition_dicts.update(new_editions)
 
         edition_dicts = process_result(edition_dicts, options.get('jscmd'))
diff --git a/openlibrary/plugins/importapi/code.py b/openlibrary/plugins/importapi/code.py
index b023627168a..b8a4ad51147 100644
--- a/openlibrary/plugins/importapi/code.py
+++ b/openlibrary/plugins/importapi/code.py
@@ -18,7 +18,7 @@
     LanguageMultipleMatchError,
     get_location_and_publisher,
 )
-from openlibrary.utils.isbn import get_isbn_10_and_13
+from openlibrary.utils.isbn import get_isbn_10s_and_13s
 
 import web
@@ -344,7 +344,7 @@ def get_ia_record(metadata: dict) -> dict:
     if description:
         d['description'] = description
     if unparsed_isbns:
-        isbn_10, isbn_13 = get_isbn_10_and_13(unparsed_isbns)
+        isbn_10, isbn_13 = get_isbn_10s_and_13s(unparsed_isbns)
         if isbn_10:
             d['isbn_10'] = isbn_10
         if isbn_13:
diff --git a/openlibrary/plugins/openlibrary/code.py b/openlibrary/plugins/openlibrary/code.py
index c43c9a40967..c7574c04de3 100644
--- a/openlibrary/plugins/openlibrary/code.py
+++ b/openlibrary/plugins/openlibrary/code.py
@@ -482,9 +482,9 @@ def remove_high_priority(query: str) -> str:
 class isbn_lookup(delegate.page):
     path = r'/(?:isbn|ISBN)/(.{10,})'
 
-    def GET(self, isbn):
+    def GET(self, isbn: str):
         input = web.input(high_priority=False)
 
-        isbn = canonical(isbn)
+        isbn = isbn if isbn.upper().startswith("B") else canonical(isbn)
         high_priority = input.get("high_priority") == "true"
         if "high_priority" in web.ctx.env.get('QUERY_STRING'):
             web.ctx.env['QUERY_STRING'] = remove_high_priority(
@@ -499,7 +499,7 @@ def GET(self, isbn):
                 ext += '?' + web.ctx.env['QUERY_STRING']
         try:
-            if ed := Edition.from_isbn(isbn=isbn, high_priority=high_priority):
+            if ed := Edition.from_isbn(isbn_or_asin=isbn, high_priority=high_priority):
                 return web.found(ed.key + ext)
         except Exception as e:
             logger.error(e)
diff --git a/openlibrary/tests/catalog/test_utils.py b/openlibrary/tests/catalog/test_utils.py
index 318090546be..f5107d08f5e 100644
--- a/openlibrary/tests/catalog/test_utils.py
+++ b/openlibrary/tests/catalog/test_utils.py
@@ -5,7 +5,9 @@
     author_dates_match,
     flip_name,
     get_missing_fields,
+    get_non_isbn_asin,
     get_publication_year,
+    is_asin_only,
     is_independently_published,
     is_promise_item,
     match_with_bad_chars,
@@ -323,6 +325,44 @@ def test_is_promise_item(rec, expected) -> None:
     assert is_promise_item(rec) == expected
 
 
+@pytest.mark.parametrize(
+    ["rec", "expected"],
+    [
+        ({"source_records": ["amazon:B01234568"]}, "B01234568"),
+        ({"source_records": ["amazon:123456890"]}, None),
+        ({"source_records": ["ia:BLOB"]}, None),
+        ({"source_records": []}, None),
+        ({"identifiers": {"ia": ["B01234568"]}}, None),
+        ({"identifiers": {"amazon": ["123456890"]}}, None),
+        ({"identifiers": {"amazon": ["B01234568"]}}, "B01234568"),
+        ({"identifiers": {"amazon": []}}, None),
+        ({"identifiers": {}}, None),
+        ({}, None),
+    ],
+)
+def test_get_non_isbn_asin(rec, expected) -> None:
+    got = get_non_isbn_asin(rec)
+    assert got == expected
+
+
+@pytest.mark.parametrize(
+    ["rec", "expected"],
+    [
+        ({"isbn_10": "123456890", "source_records": ["amazon:B01234568"]}, False),
+        ({"isbn_13": "1234567890123", "source_records": ["amazon:B01234568"]}, False),
+        ({"isbn_10": "1234567890", "identifiers": {"amazon": ["B01234568"]}}, False),
+        ({"source_records": ["amazon:1234567890"]}, False),
+        ({"identifiers": {"amazon": ["123456890"]}}, False),
+        ({}, False),
+        ({"identifiers": {"amazon": ["B01234568"]}}, True),
+        ({"source_records": ["amazon:B01234568"]}, True),
+    ],
+)
+def test_is_asin_only(rec, expected) -> None:
+    got = is_asin_only(rec)
+    assert got == expected
+
+
 @pytest.mark.parametrize(
     'name, rec, expected',
     [
diff --git a/openlibrary/tests/core/test_models.py b/openlibrary/tests/core/test_models.py
index 8c054052f57..0b49e2082dd 100644
--- a/openlibrary/tests/core/test_models.py
+++ b/openlibrary/tests/core/test_models.py
@@ -1,5 +1,7 @@
 from openlibrary.core import models
 
+import pytest
+
 
 class MockSite:
     def get(self, key):
@@ -57,6 +59,58 @@ def test_not_in_borrowable_collection_cuz_in_private_collection(self):
         e = self.mock_edition(MockPrivateEdition)
         assert not e.in_borrowable_collection()
 
+    @pytest.mark.parametrize(
+        ["isbn_or_asin", "expected"],
+        [
+            ("1111111111", ("1111111111", "")),  # ISBN 10
+            ("9780747532699", ("9780747532699", "")),  # ISBN 13
+            ("B06XYHVXVJ", ("", "B06XYHVXVJ")),  # ASIN
+            ("b06xyhvxvj", ("", "B06XYHVXVJ")),  # Lower case ASIN
+            ("", ("", "")),  # Nothing at all.
+        ],
+    )
+    def test_get_isbn_or_asin(self, isbn_or_asin, expected) -> None:
+        e: models.Edition = self.mock_edition(MockPrivateEdition)
+        got = e.get_isbn_or_asin(isbn_or_asin)
+        assert got == expected
+
+    @pytest.mark.parametrize(
+        ["isbn", "asin", "expected"],
+        [
+            ("1111111111", "", True),  # ISBN 10
+            ("", "B06XYHVXVJ", True),  # ASIN
+            ("9780747532699", "", True),  # ISBN 13
+            ("0", "", False),  # Invalid ISBN length
+            ("", "0", False),  # Invalid ASIN length
+            ("", "", False),  # Nothing at all.
+        ],
+    )
+    def test_is_valid_identifier(self, isbn, asin, expected) -> None:
+        e: models.Edition = self.mock_edition(MockPrivateEdition)
+        got = e.is_valid_identifier(isbn=isbn, asin=asin)
+        assert got == expected
+
+    @pytest.mark.parametrize(
+        ["isbn", "asin", "expected"],
+        [
+            ("1111111111", "", ["1111111111", "9781111111113"]),
+            ("9780747532699", "", ["0747532699", "9780747532699"]),
+            ("", "B06XYHVXVJ", ["B06XYHVXVJ"]),
+            (
+                "9780747532699",
+                "B06XYHVXVJ",
+                ["0747532699", "9780747532699", "B06XYHVXVJ"],
+            ),
+            ("", "", []),
+        ],
+    )
+    def test_get_identifier_forms(
+        self, isbn: str, asin: str, expected: list[str]
+    ) -> None:
+        e: models.Edition = self.mock_edition(MockPrivateEdition)
+        got = e.get_identifier_forms(isbn=isbn, asin=asin)
+        assert got == expected
+
 
 class TestAuthor:
     def test_url(self):
diff --git a/openlibrary/utils/isbn.py b/openlibrary/utils/isbn.py
index 6eff9881905..d36a27cf1e4 100644
--- a/openlibrary/utils/isbn.py
+++ b/openlibrary/utils/isbn.py
@@ -37,7 +37,7 @@ def check_digit_13(isbn):
     return str(r)
 
 
-def isbn_13_to_isbn_10(isbn_13) -> str | None:
+def isbn_13_to_isbn_10(isbn_13: str) -> str | None:
     isbn_13 = canonical(isbn_13)
     if (
         len(isbn_13) != 13
@@ -49,14 +49,14 @@ def isbn_13_to_isbn_10(isbn_13) -> str | None:
         return isbn_13[3:-1] + check_digit_10(isbn_13[3:-1])
 
 
-def isbn_10_to_isbn_13(isbn_10):
+def isbn_10_to_isbn_13(isbn_10: str) -> str | None:
     isbn_10 = canonical(isbn_10)
     if (
         len(isbn_10) != 10
         or not isbn_10[:-1].isdigit()
         or check_digit_10(isbn_10[:-1]) != isbn_10[-1]
     ):
-        return
+        return None
     isbn_13 = '978' + isbn_10[:-1]
     return isbn_13 + check_digit_13(isbn_13)
@@ -85,7 +85,33 @@ def normalize_isbn(isbn: str) -> str | None:
     return isbn and canonical(isbn) or None
 
 
-def get_isbn_10_and_13(isbns: str | list[str]) -> tuple[list[str], list[str]]:
+def get_isbn_10_and_13(isbn: str) -> tuple[str | None, str | None]:
+    """
+    Takes an ISBN 10 or 13 and returns an optional ISBN 10 and an optional
+    ISBN 13, both in canonical form.
+    """
+    if canonical_isbn := normalize_isbn(isbn):
+        isbn_13 = (
+            canonical_isbn if len(canonical_isbn) == 13 else isbn_10_to_isbn_13(isbn)
+        )
+        isbn_10 = isbn_13_to_isbn_10(isbn_13) if isbn_13 else canonical_isbn
+        return isbn_10, isbn_13
+
+    return None, None
+
+
+def normalize_identifier(
+    identifier: str,
+) -> tuple[str | None, str | None, str | None]:
+    """
+    Takes an identifier (e.g. an ISBN 10/13 or B* ASIN) and returns a tuple of
+    (ASIN, ISBN_10, ISBN_13), with None for any unavailable element and the
+    ISBNs in canonical form.
+    """
+    asin = identifier.upper() if identifier.upper().startswith("B") else None
+    return asin, *get_isbn_10_and_13(identifier)
+
+
+def get_isbn_10s_and_13s(isbns: str | list[str]) -> tuple[list[str], list[str]]:
     """
     Returns a tuple of list[isbn_10_strings], list[isbn_13_strings]
 
     Internet Archive stores ISBNs in a list of strings, with
     no differentiation between ISBN 10 and ISBN 13. Open Library
     records need ISBNs in `isbn_10` and `isbn_13` fields.
-    >>> get_isbn_10_and_13('1576079457')
+    >>> get_isbn_10s_and_13s('1576079457')
     (['1576079457'], [])
-    >>> get_isbn_10_and_13(['1576079457', '9781576079454', '1576079392'])
+    >>> get_isbn_10s_and_13s(['1576079457', '9781576079454', '1576079392'])
     (['1576079457', '1576079392'], ['9781576079454'])
 
     Notes:
diff --git a/openlibrary/utils/tests/test_isbn.py b/openlibrary/utils/tests/test_isbn.py
index 9de7bded3be..07bb6dc4ca2 100644
--- a/openlibrary/utils/tests/test_isbn.py
+++ b/openlibrary/utils/tests/test_isbn.py
@@ -1,10 +1,12 @@
 import pytest
 from openlibrary.utils.isbn import (
+    get_isbn_10_and_13,
     isbn_10_to_isbn_13,
     isbn_13_to_isbn_10,
+    normalize_identifier,
     normalize_isbn,
     opposite_isbn,
-    get_isbn_10_and_13,
+    get_isbn_10s_and_13s,
 )
@@ -50,33 +52,62 @@ def test_normalize_isbn(isbnlike, expected):
     assert normalize_isbn(isbnlike) == expected
 
 
-def test_get_isbn_10_and_13() -> None:
+def test_get_isbn_10s_and_13s() -> None:
     # isbn 10 only
-    result = get_isbn_10_and_13(["1576079457"])
+    result = get_isbn_10s_and_13s(["1576079457"])
     assert result == (["1576079457"], [])
     # isbn 13 only
-    result = get_isbn_10_and_13(["9781576079454"])
+    result = get_isbn_10s_and_13s(["9781576079454"])
     assert result == ([], ["9781576079454"])
     # mixed isbn 10 and 13, with multiple elements in each, one which has an extra space.
-    result = get_isbn_10_and_13(
+    result = get_isbn_10s_and_13s(
         ["9781576079454", "1576079457", "1576079392 ", "9781280711190"]
     )
     assert result == (["1576079457", "1576079392"], ["9781576079454", "9781280711190"])
     # an empty list
-    result = get_isbn_10_and_13([])
+    result = get_isbn_10s_and_13s([])
     assert result == ([], [])
     # not an isbn
-    result = get_isbn_10_and_13(["flop"])
+    result = get_isbn_10s_and_13s(["flop"])
     assert result == ([], [])
     # isbn 10 string, with an extra space.
- result = get_isbn_10_and_13(" 1576079457") + result = get_isbn_10s_and_13s(" 1576079457") assert result == (["1576079457"], []) # isbn 13 string - result = get_isbn_10_and_13("9781280711190") + result = get_isbn_10s_and_13s("9781280711190") assert result == ([], ["9781280711190"]) + + +@pytest.mark.parametrize( + ["isbn", "expected"], + [ + ("1111111111", ("1111111111", "9781111111113")), + ("9781111111113", ("1111111111", "9781111111113")), + ("979-1-23456-789-6", (None, "9791234567896")), + ("", (None, None)), + (None, (None, None)), + ], +) +def test_get_isbn_10_and_13(isbn, expected) -> None: + got = get_isbn_10_and_13(isbn) + assert got == expected + + +@pytest.mark.parametrize( + ["identifier", "expected"], + [ + ("B01234678", ("B01234678", None, None)), + ("1111111111", (None, "1111111111", "9781111111113")), + ("9781111111113", (None, "1111111111", "9781111111113")), + ("", (None, None, None)), + ], +) +def test_normalize_identifier(identifier, expected) -> None: + got = normalize_identifier(identifier) + assert got == expected diff --git a/scripts/affiliate_server.py b/scripts/affiliate_server.py index c5a20ce1ce0..195ed85e2d8 100644 --- a/scripts/affiliate_server.py +++ b/scripts/affiliate_server.py @@ -42,10 +42,11 @@ import threading import time +from collections.abc import Collection from dataclasses import dataclass, field from datetime import datetime from enum import Enum -from typing import Final +from typing import Any, Final import web @@ -59,6 +60,7 @@ from openlibrary.core.vendors import AmazonAPI, clean_amazon_metadata_for_load from openlibrary.utils.dateutil import WEEK_SECS from openlibrary.utils.isbn import ( + normalize_identifier, normalize_isbn, isbn_13_to_isbn_10, isbn_10_to_isbn_13, @@ -68,7 +70,7 @@ # fmt: off urls = ( - '/isbn/([0-9X-]+)', 'Submit', + '/isbn/([bB]?[0-9a-zA-Z-]+)', 'Submit', '/status', 'Status', '/clear', 'Clear', ) @@ -94,6 +96,70 @@ web.amazon_lookup_thread = None +class Priority(Enum): + """ + Priority for the `PrioritizedIdentifier` class. + + `queue.PriorityQueue` has a lowest-value-is-highest-priority system, but + setting `PrioritizedIdentifier.priority` to 0 can make it look as if priority is + disabled. Using an `Enum` can help with that. + """ + + HIGH = 0 + LOW = 1 + + def __lt__(self, other): + if isinstance(other, Priority): + return self.value < other.value + return NotImplemented + + +@dataclass(order=True, slots=True) +class PrioritizedIdentifier: + """ + Represent an identifiers's priority in the queue. Sorting is based on the `priority` + attribute, then the `timestamp` to solve tie breaks within a specific priority, + with priority going to whatever `min([items])` would return. + For more, see https://docs.python.org/3/library/queue.html#queue.PriorityQueue. + + Therefore, priority 0, which is equivalent to `Priority.HIGH`, is the highest + priority. + + This exists so certain identifiers can go to the front of the queue for faster + processing as their look-ups are time sensitive and should return look up data + to the caller (e.g. interactive API usage through `/isbn`). + """ + + identifier: str = field(compare=False) + """identifier is an ISBN 13 or B* ASIN.""" + stage_import: bool = True + """Whether to stage the item for import.""" + priority: Priority = field(default=Priority.LOW) + timestamp: datetime = field(default_factory=datetime.now) + + def __hash__(self): + """Only consider the `identifier` attribute when hashing (e.g. 
for `set` uniqueness).""" + return hash(self.identifier) + + def __eq__(self, other): + """Two instances of PrioritizedIdentifier are equal if their `identifier` attribute is equal.""" + if isinstance(other, PrioritizedIdentifier): + return self.identifier == other.identifier + return False + + def to_dict(self): + """ + Convert the PrioritizedIdentifier object to a dictionary representation suitable + for JSON serialization. + """ + return { + "isbn": self.identifier, + "priority": self.priority.name, + "stage_import": self.stage_import, + "timestamp": self.timestamp.isoformat(), + } + + def get_current_amazon_batch() -> Batch: """ At startup, get the Amazon openlibrary.core.imports.Batch() for global use. @@ -171,14 +237,41 @@ def get_pending_books(books): return pending_books -def process_amazon_batch(isbn_10s: list[str]) -> None: +def make_cache_key(product: dict[str, Any]) -> str: + """ + Takes a `product` returned from `vendor.get_products()` and returns a cache key to + identify the product. For a given product, the cache key will be either (1) its + ISBN 13, or (2) it's non-ISBN 10 ASIN (i.e. one that starts with `B`). + """ + if (isbn_13s := product.get("isbn_13")) and len(isbn_13s): + return isbn_13s[0] + + if product.get("isbn_10") and ( + cache_key := isbn_10_to_isbn_13(product.get("isbn_10", [])[0]) + ): + return cache_key + + if (source_records := product.get("source_records")) and ( + amazon_record := next( + (record for record in source_records if record.startswith("amazon:")), "" + ) + ): + return amazon_record.split(":")[1] + + return "" + + +def process_amazon_batch(asins: Collection[PrioritizedIdentifier]) -> None: """ - Call the Amazon API to get the products for a list of isbn_10s and store - each product in memcache using amazon_product_{isbn_13} as the cache key. + Call the Amazon API to get the products for a list of isbn_10s/ASINs and store + each product in memcache using amazon_product_{isbn_13 or b_asin} as the cache key. """ - logger.info(f"process_amazon_batch(): {len(isbn_10s)} items") + logger.info(f"process_amazon_batch(): {len(asins)} items") try: - products = web.amazon_api.get_products(isbn_10s, serialize=True) + identifiers = [ + prioritized_identifier.identifier for prioritized_identifier in asins + ] + products = web.amazon_api.get_products(identifiers, serialize=True) # stats_ol_affiliate_amazon_imports - Open Library - Dashboards - Grafana # http://graphite.us.archive.org Metrics.stats.ol... stats.increment( @@ -186,15 +279,11 @@ def process_amazon_batch(isbn_10s: list[str]) -> None: n=len(products), ) except Exception: - logger.exception(f"amazon_api.get_products({isbn_10s}, serialize=True)") + logger.exception(f"amazon_api.get_products({asins}, serialize=True)") return for product in products: - cache_key = ( # Make sure we have an isbn_13 for cache key - product.get('isbn_13') - and product.get('isbn_13')[0] - or isbn_10_to_isbn_13(product.get('isbn_10')[0]) - ) + cache_key = make_cache_key(product) # isbn_13 or non-ISBN-10 ASIN. cache.memcache_cache.set( # Add each product to memcache f'amazon_product_{cache_key}', product, expires=WEEK_SECS ) @@ -204,7 +293,16 @@ def process_amazon_batch(isbn_10s: list[str]) -> None: logger.debug("DB parameters missing from affiliate-server infobase") return - books = [clean_amazon_metadata_for_load(product) for product in products] + # Skip staging no_import_identifiers for for import by checking AMZ source record. 
+    no_import_identifiers = {
+        identifier.identifier for identifier in asins if not identifier.stage_import
+    }
+
+    books = [
+        clean_amazon_metadata_for_load(product)
+        for product in products
+        if product.get("source_records")[0].split(":")[1] not in no_import_identifiers
+    ]
 
     if books:
         stats.increment(
@@ -234,20 +332,18 @@ def amazon_lookup(site, stats_client, logger) -> None:
     while True:
         start_time = time.time()
-        isbn_10s: set[str] = set()  # no duplicates in the batch
-        while len(isbn_10s) < API_MAX_ITEMS_PER_CALL and seconds_remaining(start_time):
+        asins: set[PrioritizedIdentifier] = set()  # no duplicates in the batch
+        while len(asins) < API_MAX_ITEMS_PER_CALL and seconds_remaining(start_time):
             try:  # queue.get() will block (sleep) until successful or it times out
-                isbn_10s.add(
-                    web.amazon_queue.get(timeout=seconds_remaining(start_time)).isbn
-                )
+                asins.add(web.amazon_queue.get(timeout=seconds_remaining(start_time)))
             except queue.Empty:
                 pass
-        logger.info(f"Before amazon_lookup(): {len(isbn_10s)} items")
-        if isbn_10s:
+        logger.info(f"Before amazon_lookup(): {len(asins)} items")
+        if asins:
             time.sleep(seconds_remaining(start_time))
             try:
-                process_amazon_batch(list(isbn_10s))
-                logger.info(f"After amazon_lookup(): {len(isbn_10s)} items")
+                process_amazon_batch(asins)
+                logger.info(f"After amazon_lookup(): {len(asins)} items")
             except Exception:
                 logger.exception("Amazon Lookup Thread died")
                 stats_client.incr("ol.affiliate.amazon.lookup_thread_died")
@@ -290,110 +386,68 @@ def GET(self) -> str:
         return json.dumps({"Cleared": "True", "qsize": qsize})
 
 
-class Priority(Enum):
-    """
-    Priority for the `PrioritizedISBN` class.
-
-    `queue.PriorityQueue` has a lowest-value-is-highest-priority system, but
-    setting `PrioritizedISBN.priority` to 0 can make it look as if priority is
-    disabled. Using an `Enum` can help with that.
-    """
-
-    HIGH = 0
-    LOW = 1
-
-    def __lt__(self, other):
-        if isinstance(other, Priority):
-            return self.value < other.value
-        return NotImplemented
-
-
-@dataclass(order=True, slots=True)
-class PrioritizedISBN:
-    """
-    Represent an ISBN's priority in the queue. Sorting is based on the `priority`
-    attribute, then the `timestamp` to solve tie breaks within a specific priority,
-    with priority going to whatever `min([items])` would return.
-    For more, see https://docs.python.org/3/library/queue.html#queue.PriorityQueue.
-
-    Therefore, priority 0, which is equivalent to `Priority.HIGH`, is the highest
-    priority.
-
-    This exists so certain ISBNs can go to the front of the queue for faster
-    processing as their look-ups are time sensitive and should return look up data
-    to the caller (e.g. interactive API usage through `/isbn`).
-    """
-
-    isbn: str = field(compare=False)
-    priority: Priority = field(default=Priority.LOW)
-    timestamp: datetime = field(default_factory=datetime.now)
-
-    def to_dict(self):
-        """
-        Convert the PrioritizedISBN object to a dictionary representation suitable
-        for JSON serialization.
+class Submit:
+    def GET(self, identifier: str) -> str:
         """
-        return {
-            "isbn": self.isbn,
-            "priority": self.priority.name,
-            "timestamp": self.timestamp.isoformat(),
-        }
+        GET endpoint looking up ISBNs and B* ASINs via the affiliate server.
+
+        URL Parameters:
+            - high_priority='true' or 'false': whether to wait and return result.
+            - stage_import='true' or 'false': whether to stage result for import.
+              By default this is 'true'. Setting this to 'false' is useful when you
+              want to return AMZ metadata but don't want to import; in that case
+              high_priority must also be 'true', or this returns nothing and
+              stages nothing (unless the result is cached).
 
-class Submit:
-    @classmethod
-    def unpack_isbn(cls, isbn) -> tuple[str, str]:
-        isbn = normalize_isbn(isbn.replace('-', ''))
-        isbn10 = (
-            isbn
-            if len(isbn) == 10
-            else isbn.startswith('978') and isbn_13_to_isbn_10(isbn)
-        )
-        isbn13 = isbn if len(isbn) == 13 else isbn_10_to_isbn_13(isbn)
-        return isbn10, isbn13
+        If `identifier` is in memcache, then return the `hit` (which is marshalled
+        into a format appropriate for import on Open Library if `?high_priority=true`).
 
-    def GET(self, isbn: str) -> str:
-        """
-        If `isbn` is in memcache, then return the `hit` (which is marshaled into
-        a format appropriate for import on Open Library if `?high_priority=true`).
+        By default `stage_import=true`, and results will be staged for import if they have
+        requisite fields. Disable staging with `stage_import=false`.
 
-        If no hit, then queue the isbn for look up and either attempt to return
+        If no hit, then queue the identifier for lookup and either attempt to return
         a promise as `submitted`, or if `?high_priority=true`, return marshalled data
         from the cache.
 
         `Priority.HIGH` is set when `?high_priority=true` and is the highest priority.
         It is used when the caller is waiting for a response with the AMZ data, if
-        available. See `PrioritizedISBN` for more on prioritization.
+        available. See `PrioritizedIdentifier` for more on prioritization.
+
+        NOTE: For this API, "ASINs" are ISBN 10s when valid ISBN 10s, and otherwise
+        they are Amazon-specific identifiers starting with "B".
         """
         # cache could be None if reached before initialized (mypy)
         if not web.amazon_api:
             return json.dumps({"error": "not_configured"})
 
-        isbn10, isbn13 = self.unpack_isbn(isbn)
-        if not isbn10:
-            return json.dumps(
-                {"error": "rejected_isbn", "isbn10": isbn10, "isbn13": isbn13}
-            )
+        b_asin, isbn_10, isbn_13 = normalize_identifier(identifier)
+        if not (key := isbn_10 or b_asin):
+            return json.dumps({"error": "rejected_isbn", "identifier": identifier})
 
-        input = web.input(high_priority=False)
+        # Handle URL query parameters.
+        input = web.input(high_priority=False, stage_import=True)
         priority = (
             Priority.HIGH if input.get("high_priority") == "true" else Priority.LOW
         )
+        stage_import = input.get("stage_import") != "false"
 
-        # Cache lookup by isbn13. If there's a hit return the product to the caller
-        if product := cache.memcache_cache.get(f'amazon_product_{isbn13}'):
+        # Cache lookup by isbn_13 or b_asin. If there's a hit return the product to
+        # the caller.
+        if product := cache.memcache_cache.get(f'amazon_product_{isbn_13 or b_asin}'):
             return json.dumps(
                 {
                     "status": "success",
-                    "hit": product,
+                    "hit": clean_amazon_metadata_for_load(product),
                 }
             )
 
-        # Cache misses will be submitted to Amazon as ASINs (isbn10) and the response
-        # will be `staged` for import.
-        if isbn10 not in web.amazon_queue.queue:
-            isbn10_queue_item = PrioritizedISBN(isbn=isbn10, priority=priority)
-            web.amazon_queue.put_nowait(isbn10_queue_item)
+        # Cache misses will be submitted to Amazon as ASINs (isbn10 if possible, or
+        # a 'true' ASIN otherwise) and the response will be `staged` for import.
+        if key not in web.amazon_queue.queue:
+            key_queue_item = PrioritizedIdentifier(
+                identifier=key, priority=priority, stage_import=stage_import
+            )
+            web.amazon_queue.put_nowait(key_queue_item)
 
         # Give us a snapshot over time of how many new isbns are currently queued
         stats.put(
@@ -407,13 +461,24 @@ def GET(self, isbn: str) -> str:
         if priority == Priority.HIGH:
             for _ in range(RETRIES):
                 time.sleep(1)
-                if product := cache.memcache_cache.get(f'amazon_product_{isbn13}'):
+                if product := cache.memcache_cache.get(
+                    f'amazon_product_{isbn_13 or b_asin}'
+                ):
+                    # If not importing, return whatever data AMZ returns, even if it's unimportable.
                     cleaned_metadata = clean_amazon_metadata_for_load(product)
+                    if not stage_import:
+                        return json.dumps(
+                            {"status": "success", "hit": cleaned_metadata}
+                        )
+
+                    # When importing, return a result only if the item can be imported.
                     source, pid = cleaned_metadata['source_records'][0].split(":")
                     if ImportItem.find_staged_or_pending(
                         identifiers=[pid], sources=[source]
                     ):
-                        return json.dumps({"status": "success", "hit": product})
+                        return json.dumps(
+                            {"status": "success", "hit": cleaned_metadata}
+                        )
             stats.increment("ol.affiliate.amazon.total_items_not_found")
             return json.dumps({"status": "not found"})
diff --git a/scripts/manage-imports.py b/scripts/manage-imports.py
index 4a3f106658a..d5b6417d78d 100755
--- a/scripts/manage-imports.py
+++ b/scripts/manage-imports.py
@@ -7,7 +7,6 @@
 import sys
 import time
 
-import _init_path  # Imported for its side effect of setting PYTHONPATH
 import web
 
 from openlibrary.api import OLError, OpenLibrary
diff --git a/scripts/promise_batch_imports.py b/scripts/promise_batch_imports.py
index c3aeaf8ce6c..345e7096b79 100644
--- a/scripts/promise_batch_imports.py
+++ b/scripts/promise_batch_imports.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
 import json
+from typing import Any
 import ijson
 from urllib.parse import urlencode
 import requests
@@ -25,6 +26,7 @@
 from infogami import config
 from openlibrary.config import load_config
 from openlibrary.core.imports import Batch, ImportItem
+from openlibrary.core.vendors import get_amazon_metadata
 from scripts.solr_builder.solr_builder.fn_to_cli import FnToCLI
@@ -87,6 +89,31 @@ def is_isbn_13(isbn: str):
     return isbn and isbn[0].isdigit()
 
 
+def stage_b_asins_for_import(olbooks: list[dict[str, Any]]) -> None:
+    """
+    Stage B* ASINs for import via BookWorm.
+
+    This is so additional metadata may be used during import via load(), which
+    will look for `staged` rows in `import_item` and supplement `????` or otherwise
+    empty values.
+    """
+    for book in olbooks:
+        if not (amazon := book.get('identifiers', {}).get('amazon', [])):
+            continue
+
+        asin = amazon[0]
+        if asin.upper().startswith("B"):
+            try:
+                get_amazon_metadata(
+                    id_=asin,
+                    id_type="asin",
+                )
+            except requests.exceptions.ConnectionError:
+                logger.exception("Affiliate Server unreachable")
+                continue
+
+
 def batch_import(promise_id, batch_size=1000, dry_run=False):
     url = "https://archive.org/download/"
     date = promise_id.split("_")[-1]
@@ -95,16 +122,23 @@ def batch_import(promise_id, batch_size=1000, dry_run=False):
         map_book_to_olbook(book, promise_id) for book in ijson.items(resp.raw, 'item')
     )
+    # Note: dry_run won't include BookWorm data.
     if dry_run:
         for book in olbooks_gen:
             print(json.dumps(book), flush=True)
         return
 
     olbooks = list(olbooks_gen)
+
+    # Stage B* ASINs for import so as to supplement their metadata via `load()`.
+    stage_b_asins_for_import(olbooks)
+
     batch = Batch.find(promise_id) or Batch.new(promise_id)
     # Find just-in-time import candidates:
-    jit_candidates = [book['isbn_13'][0] for book in olbooks if book.get('isbn_13', [])]
-    ImportItem.bulk_mark_pending(jit_candidates)
+    if jit_candidates := [
+        book['isbn_13'][0] for book in olbooks if book.get('isbn_13', [])
+    ]:
+        ImportItem.bulk_mark_pending(jit_candidates)
     batch_items = [{'ia_id': b['local_id'][0], 'data': b} for b in olbooks]
     for i in range(0, len(batch_items), batch_size):
         batch.add_items(batch_items[i : i + batch_size])
diff --git a/scripts/tests/test_affiliate_server.py b/scripts/tests/test_affiliate_server.py
index c2b45bee494..1bbc730ebe3 100644
--- a/scripts/tests/test_affiliate_server.py
+++ b/scripts/tests/test_affiliate_server.py
@@ -7,19 +7,23 @@
 import json
 import sys
+from typing import Any
 from unittest.mock import MagicMock
 
+import pytest
+
 # TODO: Can we remove _init_path someday :(
 sys.modules['_init_path'] = MagicMock()
-
 from openlibrary.mocks.mock_infobase import mock_site  # noqa: F401
 from scripts.affiliate_server import (  # noqa: E402
-    PrioritizedISBN,
+    PrioritizedIdentifier,
     Priority,
+    Submit,
     get_isbns_from_book,
     get_isbns_from_books,
     get_editions_for_books,
     get_pending_books,
+    make_cache_key,
 )
 
 ol_editions = {
@@ -124,14 +128,54 @@ def test_get_isbns_from_books():
     ]
 
 
-def test_prioritized_isbn_can_serialize_to_json() -> None:
+def test_prioritized_identifier_equality_set_uniqueness() -> None:
     """
-    `PrioritizedISBN` needs to be be serializable to JSON because it is sometimes
-    called in, e.g. `json.dumps()`.
+    `PrioritizedIdentifier` is unique in a set when no other class instance
+    in the set has the same identifier.
     """
-    p_isbn = PrioritizedISBN(isbn="1111111111", priority=Priority.HIGH)
-    dumped_isbn = json.dumps(p_isbn.to_dict())
-    dict_isbn = json.loads(dumped_isbn)
+    identifier_1 = PrioritizedIdentifier(identifier="1111111111")
+    identifier_2 = PrioritizedIdentifier(identifier="2222222222")
+
+    set_one = set()
+    set_one.update([identifier_1, identifier_1])
+    assert len(set_one) == 1
+
+    set_two = set()
+    set_two.update([identifier_1, identifier_2])
+    assert len(set_two) == 2
 
-    assert dict_isbn["priority"] == "HIGH"
-    assert isinstance(dict_isbn["timestamp"], str)
+
+def test_prioritized_identifier_serialize_to_json() -> None:
+    """
+    `PrioritizedIdentifier` needs to be serializable to JSON because it is
+    sometimes passed to e.g. `json.dumps()`.
+    """
+    p_identifier = PrioritizedIdentifier(
+        identifier="1111111111", priority=Priority.HIGH
+    )
+    dumped_identifier = json.dumps(p_identifier.to_dict())
+    dict_identifier = json.loads(dumped_identifier)
+
+    assert dict_identifier["priority"] == "HIGH"
+    assert isinstance(dict_identifier["timestamp"], str)
+
+
+@pytest.mark.parametrize(
+    ["isbn_or_asin", "expected_key"],
+    [
+        ({"isbn_10": [], "isbn_13": ["9780747532699"]}, "9780747532699"),  # Use 13.
+        (
+            {"isbn_10": ["0747532699"], "source_records": ["amazon:B06XYHVXVJ"]},
+            "9780747532699",
+        ),  # 10 -> 13.
+        (
+            {"isbn_10": [], "isbn_13": [], "source_records": ["amazon:B06XYHVXVJ"]},
+            "B06XYHVXVJ",
+        ),  # Get non-ISBN 10 ASIN from `source_records` if necessary.
+        ({"isbn_10": [], "isbn_13": [], "source_records": []}, ""),  # Nothing to use.
+        ({}, ""),  # Nothing to use.
+    ],
+)
+def test_make_cache_key(isbn_or_asin: dict[str, Any], expected_key: str) -> None:
+    got = make_cache_key(isbn_or_asin)
+    assert got == expected_key
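
The identifier plumbing in this diff is easiest to see end to end in isolation. The sketch below is illustrative only (not part of the diff); it restates the expectations already encoded in the new tests, and assumes an Open Library checkout on PYTHONPATH where `openlibrary.utils.isbn` and `scripts.affiliate_server` import cleanly.

```python
import sys
from unittest.mock import MagicMock

sys.modules['_init_path'] = MagicMock()  # same workaround the test suite uses

from openlibrary.utils.isbn import normalize_identifier
from scripts.affiliate_server import make_cache_key

# An ISBN 10 normalizes to (asin, isbn_10, isbn_13) with no ASIN; a B* ASIN
# fills only the first slot. These tuples mirror test_normalize_identifier.
assert normalize_identifier("1111111111") == (None, "1111111111", "9781111111113")
assert normalize_identifier("B01234678") == ("B01234678", None, None)

# Cache keys prefer an ISBN 13, then an ISBN 10 upgraded to 13, then a B* ASIN
# recovered from an `amazon:` source record. These mirror test_make_cache_key.
assert make_cache_key({"isbn_10": [], "isbn_13": ["9780747532699"]}) == "9780747532699"
assert (
    make_cache_key(
        {"isbn_10": [], "isbn_13": [], "source_records": ["amazon:B06XYHVXVJ"]}
    )
    == "B06XYHVXVJ"
)
```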
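
The queue semantics of `PrioritizedIdentifier` can be checked the same way: equality and hashing consider only `identifier`, while ordering falls through the dataclass fields to the `Priority` enum. Again a sketch under the same import assumptions, not part of the PR:

```python
import sys
from unittest.mock import MagicMock

sys.modules['_init_path'] = MagicMock()  # same workaround the test suite uses

from scripts.affiliate_server import PrioritizedIdentifier, Priority

high = PrioritizedIdentifier(identifier="B06XYHVXVJ", priority=Priority.HIGH)
low = PrioritizedIdentifier(identifier="9781111111113")  # defaults to Priority.LOW

# Priority.HIGH (0) sorts ahead of Priority.LOW (1), so queue.PriorityQueue
# pops high-priority identifiers first.
assert high < low

# Re-submitting an identifier that is already batched cannot create a
# duplicate in a set-based batch, regardless of its priority or timestamp.
assert {high, PrioritizedIdentifier(identifier="B06XYHVXVJ")} == {high}
```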