
Augment non-ISBN ASIN BWB records with BookWorm data #8903

Merged
Refactor: clarify variable names in affiliate_server.py
This commit clears up some of the nomenclature around ASINs, ISBN 10s,
and ISBN 13s.

The approach is to populate whichever of `b_asin`, `isbn_10`, and
`isbn_13` are available, where `b_asin` is a `B*` ASIN from Amazon.

A variable `key` is then introduced for querying Amazon via BookWorm:
it takes the value of `isbn_10`, or of `b_asin` if `isbn_10` doesn't
exist.
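
To make the new nomenclature concrete, here is a minimal sketch of the flow the commit message describes; the expected values come from the tests added in this commit, and the `key` fallback mirrors the logic in `scripts/affiliate_server.py`:

from openlibrary.utils.isbn import normalize_identifier

# A "B*" ASIN has no ISBN forms (expected values from the new tests).
assert normalize_identifier("B01234678") == ("B01234678", None, None)

# An ISBN 13 (or 10) yields both canonical ISBN forms and no b_asin.
assert normalize_identifier("9781111111113") == (None, "1111111111", "9781111111113")

# The Amazon/BookWorm lookup key prefers isbn_10 and falls back to b_asin.
b_asin, isbn_10, isbn_13 = normalize_identifier("B01234678")
key = isbn_10 or b_asin  # -> "B01234678"

Preferring `isbn_10` keeps the Amazon query consistent for books that have ISBNs, while still letting non-ISBN `B*` ASINs through.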
scottbarnes committed Apr 26, 2024
commit 7cf07f5ad65058eef17d48da7fc92a89e649e386
4 changes: 2 additions & 2 deletions openlibrary/plugins/importapi/code.py
@@ -18,7 +18,7 @@
LanguageMultipleMatchError,
get_location_and_publisher,
)
from openlibrary.utils.isbn import get_isbn_10_and_13
from openlibrary.utils.isbn import get_isbn_10s_and_13s

import web

@@ -344,7 +344,7 @@ def get_ia_record(metadata: dict) -> dict:
if description:
d['description'] = description
if unparsed_isbns:
isbn_10, isbn_13 = get_isbn_10_and_13(unparsed_isbns)
isbn_10, isbn_13 = get_isbn_10s_and_13s(unparsed_isbns)
if isbn_10:
d['isbn_10'] = isbn_10
if isbn_13:
38 changes: 32 additions & 6 deletions openlibrary/utils/isbn.py
@@ -37,7 +37,7 @@ def check_digit_13(isbn):
return str(r)


def isbn_13_to_isbn_10(isbn_13) -> str | None:
def isbn_13_to_isbn_10(isbn_13: str) -> str | None:
isbn_13 = canonical(isbn_13)
if (
len(isbn_13) != 13
@@ -49,14 +49,14 @@ def isbn_13_to_isbn_10(isbn_13) -> str | None:
return isbn_13[3:-1] + check_digit_10(isbn_13[3:-1])


def isbn_10_to_isbn_13(isbn_10):
def isbn_10_to_isbn_13(isbn_10: str) -> str | None:
isbn_10 = canonical(isbn_10)
if (
len(isbn_10) != 10
or not isbn_10[:-1].isdigit()
or check_digit_10(isbn_10[:-1]) != isbn_10[-1]
):
return
return None
isbn_13 = '978' + isbn_10[:-1]
return isbn_13 + check_digit_13(isbn_13)

@@ -85,17 +85,43 @@ def normalize_isbn(isbn: str) -> str | None:
return isbn and canonical(isbn) or None


def get_isbn_10_and_13(isbns: str | list[str]) -> tuple[list[str], list[str]]:
def get_isbn_10_and_13(isbn: str) -> tuple[str | None, str | None]:
"""
Takes an ISBN 10 or 13 and returns an optional ISBN 10 and an optional
ISBN 13, both in canonical form.
"""
if canonical_isbn := normalize_isbn(isbn):
isbn_13 = (
canonical_isbn if len(canonical_isbn) == 13 else isbn_10_to_isbn_13(isbn)
)
isbn_10 = isbn_13_to_isbn_10(isbn_13) if isbn_13 else canonical_isbn
return isbn_10, isbn_13

return None, None


def normalize_identifier(
identifier: str,
) -> tuple[str | None, str | None, str | None]:
"""
Takes an identifier (e.g. an ISBN 10/13 or B* ASIN) and returns a tuple of
(ASIN, ISBN 10, ISBN 13), with None for any unavailable value and the ISBNs in canonical form.
"""
asin = identifier.upper() if identifier.upper().startswith("B") else None
return asin, *get_isbn_10_and_13(identifier)


def get_isbn_10s_and_13s(isbns: str | list[str]) -> tuple[list[str], list[str]]:
"""
Returns a tuple of list[isbn_10_strings], list[isbn_13_strings]

Internet Archive stores ISBNs in a string, or a list of strings,
with no differentiation between ISBN 10 and ISBN 13. Open Library
records need ISBNs in `isbn_10` and `isbn_13` fields.

>>> get_isbn_10_and_13('1576079457')
>>> get_isbn_10s_and_13s('1576079457')
(['1576079457'], [])
>>> get_isbn_10_and_13(['1576079457', '9781576079454', '1576079392'])
>>> get_isbn_10s_and_13s(['1576079457', '9781576079454', '1576079392'])
(['1576079457', '1576079392'], ['9781576079454'])

Notes:
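
To make the rename easier to follow, here is a short sketch contrasting the singular and plural helpers after this change; the expected values come from the doctests and tests in this diff:

from openlibrary.utils.isbn import get_isbn_10_and_13, get_isbn_10s_and_13s

# Singular: one ISBN in, an (isbn_10, isbn_13) pair out; either may be None.
assert get_isbn_10_and_13("9781111111113") == ("1111111111", "9781111111113")
assert get_isbn_10_and_13("979-1-23456-789-6") == (None, "9791234567896")

# Plural: a string or list of undifferentiated ISBN strings in, two lists out.
assert get_isbn_10s_and_13s(["1576079457", "9781576079454"]) == (
    ["1576079457"],
    ["9781576079454"],
)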
49 changes: 40 additions & 9 deletions openlibrary/utils/tests/test_isbn.py
@@ -1,10 +1,12 @@
import pytest
from openlibrary.utils.isbn import (
get_isbn_10_and_13,
isbn_10_to_isbn_13,
isbn_13_to_isbn_10,
normalize_identifier,
normalize_isbn,
opposite_isbn,
get_isbn_10_and_13,
get_isbn_10s_and_13s,
)


@@ -50,33 +52,62 @@ def test_normalize_isbn(isbnlike, expected):
assert normalize_isbn(isbnlike) == expected


def test_get_isbn_10_and_13() -> None:
def test_get_isbn_10s_and_13s() -> None:
# isbn 10 only
result = get_isbn_10_and_13(["1576079457"])
result = get_isbn_10s_and_13s(["1576079457"])
assert result == (["1576079457"], [])

# isbn 13 only
result = get_isbn_10_and_13(["9781576079454"])
result = get_isbn_10s_and_13s(["9781576079454"])
assert result == ([], ["9781576079454"])

# mixed isbn 10 and 13, with multiple elements in each, one of which has an extra space.
result = get_isbn_10_and_13(
result = get_isbn_10s_and_13s(
["9781576079454", "1576079457", "1576079392 ", "9781280711190"]
)
assert result == (["1576079457", "1576079392"], ["9781576079454", "9781280711190"])

# an empty list
result = get_isbn_10_and_13([])
result = get_isbn_10s_and_13s([])
assert result == ([], [])

# not an isbn
result = get_isbn_10_and_13(["flop"])
result = get_isbn_10s_and_13s(["flop"])
assert result == ([], [])

# isbn 10 string, with an extra space.
result = get_isbn_10_and_13(" 1576079457")
result = get_isbn_10s_and_13s(" 1576079457")
assert result == (["1576079457"], [])

# isbn 13 string
result = get_isbn_10_and_13("9781280711190")
result = get_isbn_10s_and_13s("9781280711190")
assert result == ([], ["9781280711190"])


@pytest.mark.parametrize(
["isbn", "expected"],
[
("1111111111", ("1111111111", "9781111111113")),
("9781111111113", ("1111111111", "9781111111113")),
("979-1-23456-789-6", (None, "9791234567896")),
("", (None, None)),
(None, (None, None)),
],
)
def test_get_isbn_10_and_13(isbn, expected) -> None:
got = get_isbn_10_and_13(isbn)
assert got == expected


@pytest.mark.parametrize(
["identifier", "expected"],
[
("B01234678", ("B01234678", None, None)),
("1111111111", (None, "1111111111", "9781111111113")),
("9781111111113", (None, "1111111111", "9781111111113")),
("", (None, None, None)),
],
)
def test_normalize_identifier(identifier, expected) -> None:
got = normalize_identifier(identifier)
assert got == expected
93 changes: 31 additions & 62 deletions scripts/affiliate_server.py
@@ -42,7 +42,7 @@
import threading
import time

from collections.abc import Iterable
from collections.abc import Collection
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
@@ -60,6 +60,7 @@
from openlibrary.core.vendors import AmazonAPI, clean_amazon_metadata_for_load
from openlibrary.utils.dateutil import WEEK_SECS
from openlibrary.utils.isbn import (
normalize_identifier,
normalize_isbn,
isbn_13_to_isbn_10,
isbn_10_to_isbn_13,
@@ -260,16 +261,15 @@ def make_cache_key(product: dict[str, Any]) -> str:
return ""


def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) -> None:
def process_amazon_batch(asins: Collection[PrioritizedIdentifier]) -> None:
"""
Call the Amazon API to get the products for a list of isbn_10s and store
each product in memcache using amazon_product_{isbn_13} as the cache key.
Call the Amazon API to get the products for a list of isbn_10s/ASINs and store
each product in memcache using amazon_product_{isbn_13 or b_asin} as the cache key.
"""
logger.info(f"process_amazon_batch(): {len(isbn_10s_or_asins)} items")
logger.info(f"process_amazon_batch(): {len(asins)} items")
try:
identifiers = [
prioritized_identifier.identifier
for prioritized_identifier in isbn_10s_or_asins
prioritized_identifier.identifier for prioritized_identifier in asins
]
products = web.amazon_api.get_products(identifiers, serialize=True)
# stats_ol_affiliate_amazon_imports - Open Library - Dashboards - Grafana
@@ -279,9 +279,7 @@ def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) ->
n=len(products),
)
except Exception:
logger.exception(
f"amazon_api.get_products({isbn_10s_or_asins}, serialize=True)"
)
logger.exception(f"amazon_api.get_products({asins}, serialize=True)")
return

for product in products:
@@ -297,10 +295,9 @@ def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) ->

# Skip staging no_import_identifiers for import by checking the AMZ source record.
no_import_identifiers = {
identifier.identifier
for identifier in isbn_10s_or_asins
if not identifier.stage_import
identifier.identifier for identifier in asins if not identifier.stage_import
}

books = [
clean_amazon_metadata_for_load(product)
for product in products
@@ -335,24 +332,18 @@ def amazon_lookup(site, stats_client, logger) -> None:

while True:
start_time = time.time()
isbn_10s_or_asins: set[PrioritizedIdentifier] = (
set()
) # no duplicates in the batch
while len(isbn_10s_or_asins) < API_MAX_ITEMS_PER_CALL and seconds_remaining(
start_time
):
asins: set[PrioritizedIdentifier] = set() # no duplicates in the batch
while len(asins) < API_MAX_ITEMS_PER_CALL and seconds_remaining(start_time):
try: # queue.get() will block (sleep) until successful or it times out
isbn_10s_or_asins.add(
web.amazon_queue.get(timeout=seconds_remaining(start_time))
)
asins.add(web.amazon_queue.get(timeout=seconds_remaining(start_time)))
except queue.Empty:
pass
logger.info(f"Before amazon_lookup(): {len(isbn_10s_or_asins)} items")
if isbn_10s_or_asins:
logger.info(f"Before amazon_lookup(): {len(asins)} items")
if asins:
time.sleep(seconds_remaining(start_time))
try:
process_amazon_batch(isbn_10s_or_asins)
logger.info(f"After amazon_lookup(): {len(isbn_10s_or_asins)} items")
process_amazon_batch(asins)
logger.info(f"After amazon_lookup(): {len(asins)} items")
except Exception:
logger.exception("Amazon Lookup Thread died")
stats_client.incr("ol.affiliate.amazon.lookup_thread_died")
@@ -396,28 +387,7 @@ def GET(self) -> str:


class Submit:
@classmethod
def unpack_isbn(cls, isbn) -> tuple[str, str]:
"""
Return an "ASIN" and an ISBN 13 if possible. Here, an "ASIN" is an
ISBN 10 if the input is an ISBN, and an Amazon-specific ASIN starting
with "B" if it's not an ISBN 10.
"""
# If the "isbn" is an Amazon-specific ASIN, return it.
if len(isbn) == 10 and isbn.startswith("B"):
return (isbn, "")

# Get ISBN 10 and 13.
isbn = normalize_isbn(isbn.replace('-', ''))
isbn10 = (
isbn
if len(isbn) == 10
else isbn.startswith('978') and isbn_13_to_isbn_10(isbn)
)
isbn13 = isbn if len(isbn) == 13 else isbn_10_to_isbn_13(isbn)
return isbn10, isbn13

def GET(self, isbn_or_asin: str) -> str:
def GET(self, identifier: str) -> str:
"""
GET endpoint looking up ISBNs and B* ASINs via the affiliate server.

Expand All @@ -429,13 +399,13 @@ def GET(self, isbn_or_asin: str) -> str:
high_priority=true must also be 'true', or this returns nothing and
stages nothing (unless the result is cached).

If `isbn_or_asin` is in memcache, then return the `hit` (which is marshalled
If `identifier` is in memcache, then return the `hit` (which is marshalled
into a format appropriate for import on Open Library if `?high_priority=true`).

By default `stage_import=true`, and results will be staged for import if they have
requisite fields. Disable staging with `stage_import=false`.

If no hit, then queue the isbn_or_asin for look up and either attempt to return
If no hit, then queue the identifier for look up and either attempt to return
a promise as `submitted`, or if `?high_priority=true`, return marshalled data
from the cache.

@@ -450,21 +420,20 @@ def GET(self, isbn_or_asin: str) -> str:
if not web.amazon_api:
return json.dumps({"error": "not_configured"})

asin, isbn13 = self.unpack_isbn(isbn_or_asin)
if not asin:
return json.dumps(
{"error": "rejected_isbn", "asin": asin, "isbn13": isbn13}
)
b_asin, isbn_10, isbn_13 = normalize_identifier(identifier)
if not (key := isbn_10 or b_asin):
return json.dumps({"error": "rejected_isbn", "identifier": identifier})

# Handle URL query parameters.
input = web.input(high_priority=False, stage_import=True)
priority = (
Priority.HIGH if input.get("high_priority") == "true" else Priority.LOW
)
stage_import = input.get("stage_import") != "false"

# Cache lookup by isbn13 or asin. If there's a hit return the product to
# Cache lookup by isbn_13 or b_asin. If there's a hit return the product to
# the caller.
if product := cache.memcache_cache.get(f'amazon_product_{isbn13 or asin}'):
if product := cache.memcache_cache.get(f'amazon_product_{isbn_13 or b_asin}'):
return json.dumps(
{
"status": "success",
@@ -474,11 +443,11 @@ def GET(self, isbn_or_asin: str) -> str:

# Cache misses will be submitted to Amazon as ASINs (isbn10 if possible, or
# a 'true' ASIN otherwise) and the response will be `staged` for import.
if asin not in web.amazon_queue.queue:
asin_queue_item = PrioritizedIdentifier(
identifier=asin, priority=priority, stage_import=stage_import
if key not in web.amazon_queue.queue:
key_queue_item = PrioritizedIdentifier(
identifier=key, priority=priority, stage_import=stage_import
)
web.amazon_queue.put_nowait(asin_queue_item)
web.amazon_queue.put_nowait(key_queue_item)

# Give us a snapshot over time of how many new isbns are currently queued
stats.put(
Expand All @@ -493,7 +462,7 @@ def GET(self, isbn_or_asin: str) -> str:
for _ in range(RETRIES):
time.sleep(1)
if product := cache.memcache_cache.get(
f'amazon_product_{isbn13 or asin}'
f'amazon_product_{isbn_13 or b_asin}'
):
# If not importing, return whatever data AMZ returns, even if it's unimportable.
cleaned_metadata = clean_amazon_metadata_for_load(product)
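
For orientation, a rough sketch of how a client might exercise the reworked `Submit.GET` endpoint. The base URL and the `/isbn/` route are assumptions for illustration only; neither is shown in this diff.

import requests  # assumption: any HTTP client works; requests is used for brevity

BASE_URL = "http://localhost:8080"  # hypothetical affiliate-server address

# An ISBN 13: normalize_identifier() yields isbn_10 and isbn_13, the queue
# key is the ISBN 10, and the cache key is amazon_product_{isbn_13}.
response = requests.get(
    f"{BASE_URL}/isbn/9781111111113", params={"high_priority": "true"}
)

# A B* ASIN: no ISBN forms exist, so the queue key and the cache key both
# fall back to the b_asin.
response = requests.get(f"{BASE_URL}/isbn/B01234678", params={"stage_import": "false"})

print(response.json())  # e.g. {"status": "success", "hit": {...}} on a cache hit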
19 changes: 3 additions & 16 deletions scripts/tests/test_affiliate_server.py
@@ -150,7 +150,9 @@ def test_prioritized_identifier_serialize_to_json() -> None:
`PrioritizedIdentifier` needs to be serializable to JSON because it is sometimes
passed to, e.g., `json.dumps()`.
"""
p_identifier = PrioritizedIdentifier(identifier="1111111111", priority=Priority.HIGH)
p_identifier = PrioritizedIdentifier(
identifier="1111111111", priority=Priority.HIGH
)
dumped_identifier = json.dumps(p_identifier.to_dict())
dict_identifier = json.loads(dumped_identifier)

@@ -177,18 +179,3 @@ def test_prioritized_identifier_serialize_to_json() -> None:
def test_make_cache_key(isbn_or_asin: dict[str, Any], expected_key: str) -> None:
got = make_cache_key(isbn_or_asin)
assert got == expected_key


@pytest.mark.parametrize(
["isbn_or_asin", "expected"],
[
("0123456789", ("0123456789", "9780123456786")),
("0-.123456789", ("0123456789", "9780123456786")),
("9780123456786", ("0123456789", "9780123456786")),
("9-.780123456786", ("0123456789", "9780123456786")),
("B012346789", ("B012346789", "")),
],
)
def test_unpack_isbn(isbn_or_asin, expected) -> None:
got = Submit.unpack_isbn(isbn_or_asin)
assert got == expected