
Augment non-ISBN ASIN BWB records with BookWorm data #8903

Merged
Refactor: clarify variable names in affiliate_server.py
This commit clears up some of the nomenclature around ASINs, ISBN 10s,
and ISBN 13s.

The approach is to populate whichever of `b_asin`, `isbn_10`, and
`isbn_13` are available, where `b_asin` is a `B*` ASIN from Amazon.

A variable `key` is then introduced for querying Amazon via BookWorm:
it takes the value of `isbn_10`, or of `b_asin` if `isbn_10` doesn't
exist.
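
To make the new nomenclature concrete, here is a minimal sketch of the flow the commit message describes; the expected values come from the tests added in this commit, and the `key` fallback mirrors the logic in `scripts/affiliate_server.py`:

from openlibrary.utils.isbn import normalize_identifier

# A "B*" ASIN has no ISBN forms (expected values from the new tests).
assert normalize_identifier("B01234678") == ("B01234678", None, None)

# An ISBN 13 (or 10) yields both canonical ISBN forms and no b_asin.
assert normalize_identifier("9781111111113") == (None, "1111111111", "9781111111113")

# The Amazon/BookWorm lookup key prefers isbn_10 and falls back to b_asin.
b_asin, isbn_10, isbn_13 = normalize_identifier("B01234678")
key = isbn_10 or b_asin  # -> "B01234678"

Preferring `isbn_10` keeps the Amazon query consistent for books that have ISBNs, while still letting non-ISBN `B*` ASINs through.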
scottbarnes committed Apr 26, 2024
commit 7cf07f5ad65058eef17d48da7fc92a89e649e386
4 changes: 2 additions & 2 deletions openlibrary/plugins/importapi/code.py
@@ -18,7 +18,7 @@
LanguageMultipleMatchError,
get_location_and_publisher,
)
from openlibrary.utils.isbn import get_isbn_10_and_13
from openlibrary.utils.isbn import get_isbn_10s_and_13s

import web

@@ -344,7 +344,7 @@ def get_ia_record(metadata: dict) -> dict:
if description:
d['description'] = description
if unparsed_isbns:
isbn_10, isbn_13 = get_isbn_10_and_13(unparsed_isbns)
isbn_10, isbn_13 = get_isbn_10s_and_13s(unparsed_isbns)
if isbn_10:
d['isbn_10'] = isbn_10
if isbn_13:
38 changes: 32 additions & 6 deletions openlibrary/utils/isbn.py
@@ -37,7 +37,7 @@ def check_digit_13(isbn):
return str(r)


def isbn_13_to_isbn_10(isbn_13) -> str | None:
def isbn_13_to_isbn_10(isbn_13: str) -> str | None:
isbn_13 = canonical(isbn_13)
if (
len(isbn_13) != 13
@@ -49,14 +49,14 @@ def isbn_13_to_isbn_10(isbn_13) -> str | None:
return isbn_13[3:-1] + check_digit_10(isbn_13[3:-1])


def isbn_10_to_isbn_13(isbn_10):
def isbn_10_to_isbn_13(isbn_10: str) -> str | None:
isbn_10 = canonical(isbn_10)
if (
len(isbn_10) != 10
or not isbn_10[:-1].isdigit()
or check_digit_10(isbn_10[:-1]) != isbn_10[-1]
):
return
return None
isbn_13 = '978' + isbn_10[:-1]
return isbn_13 + check_digit_13(isbn_13)

@@ -85,17 +85,43 @@ def normalize_isbn(isbn: str) -> str | None:
return isbn and canonical(isbn) or None


def get_isbn_10_and_13(isbns: str | list[str]) -> tuple[list[str], list[str]]:
def get_isbn_10_and_13(isbn: str) -> tuple[str | None, str | None]:
"""
Takes an ISBN 10 or 13 and returns an optional ISBN 10 and an optional
ISBN 13, both in canonical form.
"""
if canonical_isbn := normalize_isbn(isbn):
isbn_13 = (
canonical_isbn if len(canonical_isbn) == 13 else isbn_10_to_isbn_13(isbn)
)
isbn_10 = isbn_13_to_isbn_10(isbn_13) if isbn_13 else canonical_isbn
return isbn_10, isbn_13

return None, None


def normalize_identifier(
identifier: str,
) -> tuple[str | None, str | None, str | None]:
"""
Takes an identifier (e.g. an ISBN 10/13 or B* ASIN) and returns a tuple of
(ASIN, ISBN 10, ISBN 13), with None for any unavailable value and the ISBNs in canonical form.
"""
asin = identifier.upper() if identifier.upper().startswith("B") else None
return asin, *get_isbn_10_and_13(identifier)


def get_isbn_10s_and_13s(isbns: str | list[str]) -> tuple[list[str], list[str]]:
"""
Returns a tuple of list[isbn_10_strings], list[isbn_13_strings]

Internet Archive stores ISBNs in a string, or a list of strings,
with no differentiation between ISBN 10 and ISBN 13. Open Library
records need ISBNs in `isbn_10` and `isbn_13` fields.

>>> get_isbn_10_and_13('1576079457')
>>> get_isbn_10s_and_13s('1576079457')
(['1576079457'], [])
>>> get_isbn_10_and_13(['1576079457', '9781576079454', '1576079392'])
>>> get_isbn_10s_and_13s(['1576079457', '9781576079454', '1576079392'])
(['1576079457', '1576079392'], ['9781576079454'])

Notes:
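
To make the rename easier to follow, here is a short sketch contrasting the singular and plural helpers after this change; the expected values come from the doctests and tests in this diff:

from openlibrary.utils.isbn import get_isbn_10_and_13, get_isbn_10s_and_13s

# Singular: one ISBN in, an (isbn_10, isbn_13) pair out; either may be None.
assert get_isbn_10_and_13("9781111111113") == ("1111111111", "9781111111113")
assert get_isbn_10_and_13("979-1-23456-789-6") == (None, "9791234567896")

# Plural: a string or list of undifferentiated ISBN strings in, two lists out.
assert get_isbn_10s_and_13s(["1576079457", "9781576079454"]) == (
    ["1576079457"],
    ["9781576079454"],
)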
49 changes: 40 additions & 9 deletions openlibrary/utils/tests/test_isbn.py
@@ -1,10 +1,12 @@
import pytest
from openlibrary.utils.isbn import (
get_isbn_10_and_13,
isbn_10_to_isbn_13,
isbn_13_to_isbn_10,
normalize_identifier,
normalize_isbn,
opposite_isbn,
get_isbn_10_and_13,
get_isbn_10s_and_13s,
)


@@ -50,33 +52,62 @@ def test_normalize_isbn(isbnlike, expected):
assert normalize_isbn(isbnlike) == expected


def test_get_isbn_10_and_13() -> None:
def test_get_isbn_10s_and_13s() -> None:
# isbn 10 only
result = get_isbn_10_and_13(["1576079457"])
result = get_isbn_10s_and_13s(["1576079457"])
assert result == (["1576079457"], [])

# isbn 13 only
result = get_isbn_10_and_13(["9781576079454"])
result = get_isbn_10s_and_13s(["9781576079454"])
assert result == ([], ["9781576079454"])

# mixed isbn 10 and 13, with multiple elements in each, one of which has an extra space.
result = get_isbn_10_and_13(
result = get_isbn_10s_and_13s(
["9781576079454", "1576079457", "1576079392 ", "9781280711190"]
)
assert result == (["1576079457", "1576079392"], ["9781576079454", "9781280711190"])

# an empty list
result = get_isbn_10_and_13([])
result = get_isbn_10s_and_13s([])
assert result == ([], [])

# not an isbn
result = get_isbn_10_and_13(["flop"])
result = get_isbn_10s_and_13s(["flop"])
assert result == ([], [])

# isbn 10 string, with an extra space.
result = get_isbn_10_and_13(" 1576079457")
result = get_isbn_10s_and_13s(" 1576079457")
assert result == (["1576079457"], [])

# isbn 13 string
result = get_isbn_10_and_13("9781280711190")
result = get_isbn_10s_and_13s("9781280711190")
assert result == ([], ["9781280711190"])


@pytest.mark.parametrize(
["isbn", "expected"],
[
("1111111111", ("1111111111", "9781111111113")),
("9781111111113", ("1111111111", "9781111111113")),
("979-1-23456-789-6", (None, "9791234567896")),
("", (None, None)),
(None, (None, None)),
],
)
def test_get_isbn_10_and_13(isbn, expected) -> None:
got = get_isbn_10_and_13(isbn)
assert got == expected


@pytest.mark.parametrize(
["identifier", "expected"],
[
("B01234678", ("B01234678", None, None)),
("1111111111", (None, "1111111111", "9781111111113")),
("9781111111113", (None, "1111111111", "9781111111113")),
("", (None, None, None)),
],
)
def test_normalize_identifier(identifier, expected) -> None:
got = normalize_identifier(identifier)
assert got == expected
93 changes: 31 additions & 62 deletions scripts/affiliate_server.py
@@ -42,7 +42,7 @@
import threading
import time

from collections.abc import Iterable
from collections.abc import Collection
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
@@ -60,6 +60,7 @@
from openlibrary.core.vendors import AmazonAPI, clean_amazon_metadata_for_load
from openlibrary.utils.dateutil import WEEK_SECS
from openlibrary.utils.isbn import (
normalize_identifier,
normalize_isbn,
isbn_13_to_isbn_10,
isbn_10_to_isbn_13,
@@ -260,16 +261,15 @@ def make_cache_key(product: dict[str, Any]) -> str:
return ""


def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) -> None:
def process_amazon_batch(asins: Collection[PrioritizedIdentifier]) -> None:
"""
Call the Amazon API to get the products for a list of isbn_10s and store
each product in memcache using amazon_product_{isbn_13} as the cache key.
Call the Amazon API to get the products for a list of isbn_10s/ASINs and store
each product in memcache using amazon_product_{isbn_13 or b_asin} as the cache key.
"""
logger.info(f"process_amazon_batch(): {len(isbn_10s_or_asins)} items")
logger.info(f"process_amazon_batch(): {len(asins)} items")
try:
identifiers = [
prioritized_identifier.identifier
for prioritized_identifier in isbn_10s_or_asins
prioritized_identifier.identifier for prioritized_identifier in asins
]
products = web.amazon_api.get_products(identifiers, serialize=True)
# stats_ol_affiliate_amazon_imports - Open Library - Dashboards - Grafana
@@ -279,9 +279,7 @@ def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) ->
n=len(products),
)
except Exception:
logger.exception(
f"amazon_api.get_products({isbn_10s_or_asins}, serialize=True)"
)
logger.exception(f"amazon_api.get_products({asins}, serialize=True)")
return

for product in products:
@@ -297,10 +295,9 @@ def process_amazon_batch(isbn_10s_or_asins: Iterable[PrioritizedIdentifier]) ->

# Skip staging no_import_identifiers for import by checking the AMZ source record.
no_import_identifiers = {
identifier.identifier
for identifier in isbn_10s_or_asins
if not identifier.stage_import
identifier.identifier for identifier in asins if not identifier.stage_import
}

books = [
clean_amazon_metadata_for_load(product)
for product in products
@@ -335,24 +332,18 @@ def amazon_lookup(site, stats_client, logger) -> None:

while True:
start_time = time.time()
isbn_10s_or_asins: set[PrioritizedIdentifier] = (
set()
) # no duplicates in the batch
while len(isbn_10s_or_asins) < API_MAX_ITEMS_PER_CALL and seconds_remaining(
start_time
):
asins: set[PrioritizedIdentifier] = set() # no duplicates in the batch
while len(asins) < API_MAX_ITEMS_PER_CALL and seconds_remaining(start_time):
try: # queue.get() will block (sleep) until successful or it times out
isbn_10s_or_asins.add(
web.amazon_queue.get(timeout=seconds_remaining(start_time))
)
asins.add(web.amazon_queue.get(timeout=seconds_remaining(start_time)))
except queue.Empty:
pass
logger.info(f"Before amazon_lookup(): {len(isbn_10s_or_asins)} items")
if isbn_10s_or_asins:
logger.info(f"Before amazon_lookup(): {len(asins)} items")
if asins:
time.sleep(seconds_remaining(start_time))
try:
process_amazon_batch(isbn_10s_or_asins)
logger.info(f"After amazon_lookup(): {len(isbn_10s_or_asins)} items")
process_amazon_batch(asins)
logger.info(f"After amazon_lookup(): {len(asins)} items")
except Exception:
logger.exception("Amazon Lookup Thread died")
stats_client.incr("ol.affiliate.amazon.lookup_thread_died")
@@ -396,28 +387,7 @@ def GET(self) -> str:


class Submit:
@classmethod
def unpack_isbn(cls, isbn) -> tuple[str, str]:
"""
Return an "ASIN" and an ISBN 13 if possible. Here, an "ASIN" is an
ISBN 10 if the input is an ISBN, and an Amazon-specific ASIN starting
with "B" if it's not an ISBN 10.
"""
# If the "isbn" is an Amazon-specific ASIN, return it.
if len(isbn) == 10 and isbn.startswith("B"):
return (isbn, "")

# Get ISBN 10 and 13.
isbn = normalize_isbn(isbn.replace('-', ''))
isbn10 = (
isbn
if len(isbn) == 10
else isbn.startswith('978') and isbn_13_to_isbn_10(isbn)
)
isbn13 = isbn if len(isbn) == 13 else isbn_10_to_isbn_13(isbn)
return isbn10, isbn13

def GET(self, isbn_or_asin: str) -> str:
def GET(self, identifier: str) -> str:
"""
GET endpoint looking up ISBNs and B* ASINs via the affiliate server.

Expand All @@ -429,13 +399,13 @@ def GET(self, isbn_or_asin: str) -> str:
high_priority=true must also be 'true', or this returns nothing and
stages nothing (unless the result is cached).

If `isbn_or_asin` is in memcache, then return the `hit` (which is marshalled
If `identifier` is in memcache, then return the `hit` (which is marshalled
into a format appropriate for import on Open Library if `?high_priority=true`).

By default `stage_import=true`, and results will be staged for import if they have
requisite fields. Disable staging with `stage_import=false`.

If no hit, then queue the isbn_or_asin for look up and either attempt to return
If no hit, then queue the identifier for look up and either attempt to return
a promise as `submitted`, or if `?high_priority=true`, return marshalled data
from the cache.

@@ -450,21 +420,20 @@ def GET(self, isbn_or_asin: str) -> str:
if not web.amazon_api:
return json.dumps({"error": "not_configured"})

asin, isbn13 = self.unpack_isbn(isbn_or_asin)
if not asin:
return json.dumps(
{"error": "rejected_isbn", "asin": asin, "isbn13": isbn13}
)
b_asin, isbn_10, isbn_13 = normalize_identifier(identifier)
if not (key := isbn_10 or b_asin):
return json.dumps({"error": "rejected_isbn", "identifier": identifier})

# Handle URL query parameters.
input = web.input(high_priority=False, stage_import=True)
priority = (
Priority.HIGH if input.get("high_priority") == "true" else Priority.LOW
)
stage_import = input.get("stage_import") != "false"

# Cache lookup by isbn13 or asin. If there's a hit return the product to
# Cache lookup by isbn_13 or b_asin. If there's a hit return the product to
# the caller.
if product := cache.memcache_cache.get(f'amazon_product_{isbn13 or asin}'):
if product := cache.memcache_cache.get(f'amazon_product_{isbn_13 or b_asin}'):
return json.dumps(
{
"status": "success",
@@ -474,11 +443,11 @@ def GET(self, isbn_or_asin: str) -> str:

# Cache misses will be submitted to Amazon as ASINs (isbn10 if possible, or
# a 'true' ASIN otherwise) and the response will be `staged` for import.
if asin not in web.amazon_queue.queue:
asin_queue_item = PrioritizedIdentifier(
identifier=asin, priority=priority, stage_import=stage_import
if key not in web.amazon_queue.queue:
key_queue_item = PrioritizedIdentifier(
identifier=key, priority=priority, stage_import=stage_import
)
web.amazon_queue.put_nowait(asin_queue_item)
web.amazon_queue.put_nowait(key_queue_item)

# Give us a snapshot over time of how many new isbns are currently queued
stats.put(
Expand All @@ -493,7 +462,7 @@ def GET(self, isbn_or_asin: str) -> str:
for _ in range(RETRIES):
time.sleep(1)
if product := cache.memcache_cache.get(
f'amazon_product_{isbn13 or asin}'
f'amazon_product_{isbn_13 or b_asin}'
):
# If not importing, return whatever data AMZ returns, even if it's unimportable.
cleaned_metadata = clean_amazon_metadata_for_load(product)
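
For orientation, a rough sketch of how a client might exercise the reworked `Submit.GET` endpoint. The base URL and the `/isbn/` route are assumptions for illustration only; neither is shown in this diff.

import requests  # assumption: any HTTP client works; requests is used for brevity

BASE_URL = "http://localhost:8080"  # hypothetical affiliate-server address

# An ISBN 13: normalize_identifier() yields isbn_10 and isbn_13, the queue
# key is the ISBN 10, and the cache key is amazon_product_{isbn_13}.
response = requests.get(
    f"{BASE_URL}/isbn/9781111111113", params={"high_priority": "true"}
)

# A B* ASIN: no ISBN forms exist, so the queue key and the cache key both
# fall back to the b_asin.
response = requests.get(f"{BASE_URL}/isbn/B01234678", params={"stage_import": "false"})

print(response.json())  # e.g. {"status": "success", "hit": {...}} on a cache hit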
19 changes: 3 additions & 16 deletions scripts/tests/test_affiliate_server.py
@@ -150,7 +150,9 @@ def test_prioritized_identifier_serialize_to_json() -> None:
`PrioritizedIdentifier` needs to be serializable to JSON because it is sometimes
passed to, e.g., `json.dumps()`.
"""
p_identifier = PrioritizedIdentifier(identifier="1111111111", priority=Priority.HIGH)
p_identifier = PrioritizedIdentifier(
identifier="1111111111", priority=Priority.HIGH
)
dumped_identifier = json.dumps(p_identifier.to_dict())
dict_identifier = json.loads(dumped_identifier)

@@ -177,18 +179,3 @@ def test_prioritized_identifier_serialize_to_json() -> None:
def test_make_cache_key(isbn_or_asin: dict[str, Any], expected_key: str) -> None:
got = make_cache_key(isbn_or_asin)
assert got == expected_key


@pytest.mark.parametrize(
["isbn_or_asin", "expected"],
[
("0123456789", ("0123456789", "9780123456786")),
("0-.123456789", ("0123456789", "9780123456786")),
("9780123456786", ("0123456789", "9780123456786")),
("9-.780123456786", ("0123456789", "9780123456786")),
("B012346789", ("B012346789", "")),
],
)
def test_unpack_isbn(isbn_or_asin, expected) -> None:
got = Submit.unpack_isbn(isbn_or_asin)
assert got == expected