diff --git a/src/cobib/commands/add.py b/src/cobib/commands/add.py
index 8664a91c3da946a097d35bcf093fef7c51ef02fc..65e16df150f99b44332ef532a3639347f4777805 100644
--- a/src/cobib/commands/add.py
+++ b/src/cobib/commands/add.py
@@ -15,6 +15,7 @@
 cobib add --bibtex some_biblatex_file.bib
 cobib add --arxiv <some arXiv ID>
 cobib add --doi <some DOI>
 cobib add --isbn <some ISBN>
+cobib add --url <some URL>
 cobib add --yaml some_cobib_style_yaml_file.yaml
 ```
@@ -127,6 +128,7 @@ class AddCommand(Command):
             args: a sequence of additional arguments used for the execution. The following
                 values are allowed for this command:
                     * `-l`, `--label`: the label to give to the new entry.
+                    * `-u`, `--update`: updates an existing database entry if it already exists.
                     * `-f`, `--file`: one or multiple files to associate with this entry. This
                         data will be stored in the `cobib.database.Entry.file` property.
                     * `-p`, `--path`: the path to store the downloaded associated file in. This can
@@ -170,9 +172,15 @@ class AddCommand(Command):
             cls.name: cls for _, cls in inspect.getmembers(parsers) if inspect.isclass(cls)
         }
         for name in avail_parsers.keys():
-            group_add.add_argument(
-                f"-{name[0]}", f"--{name}", type=str, help=f"{name} object identfier"
-            )
+            try:
+                group_add.add_argument(
+                    f"-{name[0]}", f"--{name}", type=str, help=f"{name} object identifier"
+                )
+            except argparse.ArgumentError:
+                try:
+                    group_add.add_argument(f"--{name}", type=str, help=f"{name} object identifier")
+                except argparse.ArgumentError:
+                    continue
         parser.add_argument(
             "tags",
             nargs=argparse.REMAINDER,
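Note on the nested try/except above: the new `-u`/`--update` flag claims the short option that the URL parser's name would otherwise receive, and argparse reports such collisions by raising `argparse.ArgumentError` from `add_argument`, so the registration falls back to the long flag only. This is why the URL parser is reachable solely via `--url`. A minimal standalone sketch of the fallback, using a plain `ArgumentParser` instead of coBib's actual `group_add`:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-u", "--update", action="store_true")

for name in ["url"]:  # the parser name whose short flag collides with -u
    try:
        # preferred: register both the short and the long option
        parser.add_argument(f"-{name[0]}", f"--{name}", type=str)
    except argparse.ArgumentError:
        # "-u" is already taken by --update, so register the long option only
        parser.add_argument(f"--{name}", type=str)

print(parser.parse_args(["--url", "https://example.com"]))
# Namespace(update=False, url='https://example.com')
```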
+"""A regex pattern used to match valid DOIs.""" + class ArxivParser(Parser): """The arXiv Parser.""" name = "arxiv" - ARXIV_URL = "https://export.arxiv.org/api/query?id_list=" - """arXiv exporting URL taken from [here](https://arxiv.org/help/oa).""" - def parse(self, string: str) -> Dict[str, Entry]: # pdoc will inherit the docstring from the base class # noqa: D102 - LOGGER.info("Gathering BibTex data for arXiv ID: %s.", string) try: - page = requests.get(self.ARXIV_URL + string, timeout=10) + match = re.search(ARXIV_REGEX, string) + if match is None: + raise AssertionError + except AssertionError: + msg = f"'{string}' is not a valid arXiv ID." + LOGGER.warning(msg) + return OrderedDict() + arxiv_id = match.group(1) + LOGGER.info("Gathering BibTex data for arXiv ID: %s.", arxiv_id) + try: + page = requests.get(ARXIV_URL + arxiv_id, timeout=10) except requests.exceptions.RequestException as err: - LOGGER.error("An Exception occurred while trying to query the arXiv ID: %s.", string) + LOGGER.error("An Exception occurred while trying to query the arXiv ID: %s.", arxiv_id) LOGGER.error(err) return OrderedDict() xml = BeautifulSoup(page.text, features="html.parser") diff --git a/src/cobib/parsers/doi.py b/src/cobib/parsers/doi.py index a68465074a792fd7adec709a678dbde409ee5933..be79c45d22c83e8ec207a5b82d344b4e5f7e4b25 100644 --- a/src/cobib/parsers/doi.py +++ b/src/cobib/parsers/doi.py @@ -13,6 +13,8 @@ Furthermore, you should look into the `config.utils.file_downloader.url_map` set you tell coBib how to map from journal landing page URLs to the corresponding PDF URLs. For more information check out `cobib.config.example` and the man-page. +Since v3.3.0 this parser even supports URLs from which a DOI can be extracted directly. + The parser is registered under the `-d` and `--doi` command-line arguments of the `cobib.commands.add.AddCommand`. @@ -34,35 +36,38 @@ from .bibtex import BibtexParser LOGGER = logging.getLogger(__name__) +DOI_URL = "https://doi.org/" +"""The DOI 'API' URL.""" +DOI_HEADER = {"Accept": "application/x-bibtex"} +"""The DOI 'API' header taken from [here](https://crosscite.org/docs.html).""" +DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b' +"""A regex pattern used to match valid DOIs.""" + class DOIParser(Parser): """The DOI Parser.""" name = "doi" - DOI_URL = "https://doi.org/" - """The DOI 'API' URL.""" - DOI_HEADER = {"Accept": "application/x-bibtex"} - """The DOI 'API' header taken from [here](https://crosscite.org/docs.html).""" - DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b' - """A regex pattern used to match valid DOIs.""" - def parse(self, string: str) -> Dict[str, Entry]: # pdoc will inherit the docstring from the base class # noqa: D102 try: - assert re.match(self.DOI_REGEX, string) + match = re.search(DOI_REGEX, string) + if match is None: + raise AssertionError except AssertionError: msg = f"'{string}' is not a valid DOI." 
diff --git a/src/cobib/parsers/doi.py b/src/cobib/parsers/doi.py
index a68465074a792fd7adec709a678dbde409ee5933..be79c45d22c83e8ec207a5b82d344b4e5f7e4b25 100644
--- a/src/cobib/parsers/doi.py
+++ b/src/cobib/parsers/doi.py
@@ -13,6 +13,8 @@ Furthermore, you should look into the `config.utils.file_downloader.url_map` set
 you tell coBib how to map from journal landing page URLs to the corresponding PDF URLs. For more
 information check out `cobib.config.example` and the man-page.
 
+Since v3.3.0 this parser also supports URLs from which a DOI can be extracted directly.
+
 The parser is registered under the `-d` and `--doi` command-line arguments of the
 `cobib.commands.add.AddCommand`.
@@ -34,35 +36,38 @@ from .bibtex import BibtexParser
 
 LOGGER = logging.getLogger(__name__)
 
+DOI_URL = "https://doi.org/"
+"""The DOI 'API' URL."""
+DOI_HEADER = {"Accept": "application/x-bibtex"}
+"""The DOI 'API' header taken from [here](https://crosscite.org/docs.html)."""
+DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b'
+"""A regex pattern used to match valid DOIs."""
+
 
 class DOIParser(Parser):
     """The DOI Parser."""
 
     name = "doi"
 
-    DOI_URL = "https://doi.org/"
-    """The DOI 'API' URL."""
-    DOI_HEADER = {"Accept": "application/x-bibtex"}
-    """The DOI 'API' header taken from [here](https://crosscite.org/docs.html)."""
-    DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b'
-    """A regex pattern used to match valid DOIs."""
-
     def parse(self, string: str) -> Dict[str, Entry]:
         # pdoc will inherit the docstring from the base class
         # noqa: D102
         try:
-            assert re.match(self.DOI_REGEX, string)
+            match = re.search(DOI_REGEX, string)
+            if match is None:
+                raise AssertionError
         except AssertionError:
             msg = f"'{string}' is not a valid DOI."
             LOGGER.warning(msg)
             return OrderedDict()
-        LOGGER.info("Gathering BibTex data for DOI: %s.", string)
+        doi = match.group(1)
+        LOGGER.info("Gathering BibTex data for DOI: %s.", doi)
         try:
-            page = requests.get(self.DOI_URL + string, headers=self.DOI_HEADER, timeout=10)
+            page = requests.get(DOI_URL + doi, headers=DOI_HEADER, timeout=10)
             # this assumes that the doi.org page redirects to the correct journal's landing page
-            redirected_url = requests.head(self.DOI_URL + string, timeout=1).headers["Location"]
+            redirected_url = requests.head(DOI_URL + doi, timeout=1).headers["Location"]
         except requests.exceptions.RequestException as err:
-            LOGGER.error("An Exception occurred while trying to query the DOI: %s.", string)
+            LOGGER.error("An Exception occurred while trying to query the DOI: %s.", doi)
             LOGGER.error(err)
             return OrderedDict()
         bib = BibtexParser().parse(page.text)
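Moving the constants to module level is what lets the new URL parser import `DOI_REGEX` directly. The lookup itself is plain content negotiation against doi.org: requesting the DOI with an `Accept: application/x-bibtex` header yields a BibTeX record. A standalone sketch of that flow (note that it performs a live network request):

```python
import re

import requests

DOI_URL = "https://doi.org/"
DOI_HEADER = {"Accept": "application/x-bibtex"}
DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b'

match = re.search(DOI_REGEX, "https://doi.org/10.1021/acs.chemrev.8b00803")
if match is not None:
    doi = match.group(1)  # "10.1021/acs.chemrev.8b00803"
    # doi.org content negotiation: the Accept header requests raw BibTeX
    page = requests.get(DOI_URL + doi, headers=DOI_HEADER, timeout=10)
    print(page.text)  # raw BibTeX for the resolved DOI
```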
diff --git a/src/cobib/parsers/isbn.py b/src/cobib/parsers/isbn.py
index 271384385b6387594070aa42bb8ed0340a04a3d2..460ff3c51b64ce005d098c7447acee06e016253a 100644
--- a/src/cobib/parsers/isbn.py
+++ b/src/cobib/parsers/isbn.py
@@ -7,6 +7,8 @@ Note, that the openlibrary API does not contain all ISBNs and potential server e
 by the parser.
 In the future, I hope to make the API backend configurable.
 
+Since v3.3.0 this parser also supports URLs from which an ISBN can be extracted directly.
+
 The parser is registered under the `-i` and `--isbn` command-line arguments of the
 `cobib.commands.add.AddCommand`.
@@ -28,30 +30,38 @@ from .base_parser import Parser
 
 LOGGER = logging.getLogger(__name__)
 
+ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:"
+"""ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books)."""
+ISBN_REGEX = re.compile(
+    r"(97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|[-0-9X]{10,16})", re.I | re.M | re.S
+)
+"""A regex pattern used to match valid ISBNs. Adapted from
+[here](https://github.com/xlcnd/isbnlib)."""
+
 
 class ISBNParser(Parser):
     """The ISBN Parser."""
 
     name = "isbn"
 
-    ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:"
-    """ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books)."""
-    ISBN_REGEX = re.compile(
-        r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|" r"[-0-9X]{10,16}", re.I | re.M | re.S
-    )
-    """A regex pattern used to match valid ISBNs. Adapted from
-    [here](https://github.com/xlcnd/isbnlib)."""
-
     def parse(self, string: str) -> Dict[str, Entry]:
         # pdoc will inherit the docstring from the base class
         # noqa: D102
-        assert re.match(self.ISBN_REGEX, string)
-        LOGGER.info("Gathering BibTex data for ISBN: %s.", string)
-        isbn_plain = "".join([i for i in string if i.isdigit()])
         try:
-            page = requests.get(self.ISBN_URL + isbn_plain + "&jscmd=data&format=json", timeout=10)
+            match = re.search(ISBN_REGEX, string)
+            if match is None:
+                raise AssertionError
+        except AssertionError:
+            msg = f"'{string}' is not a valid ISBN."
+            LOGGER.warning(msg)
+            return OrderedDict()
+        isbn = match.group(1)
+        LOGGER.info("Gathering BibTex data for ISBN: %s.", isbn)
+        isbn_plain = "".join([i for i in isbn if i.isdigit()])
+        try:
+            page = requests.get(ISBN_URL + isbn_plain + "&jscmd=data&format=json", timeout=10)
         except requests.exceptions.RequestException as err:
-            LOGGER.error("An Exception occurred while trying to query the ISBN: %s.", string)
+            LOGGER.error("An Exception occurred while trying to query the ISBN: %s.", isbn)
             LOGGER.error(err)
             return OrderedDict()
         try:
@@ -62,7 +72,7 @@ class ISBNParser(Parser):
             return OrderedDict()
         if not contents:
             msg = (
-                f'No data was found for ISBN "{string}". If you think this is an error and '
+                f'No data was found for ISBN "{isbn}". If you think this is an error and '
                 + "the openlibrary API should provide an entry, please file a bug report. "
                 + "Otherwise please try adding this entry manually until more APIs are "
                 + "available in coBib."
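Beyond the moved constants, two behavioral changes are worth noting: the regex now carries an explicit capture group (so `match.group(1)` works from the URL parser too), and the bare `assert` has been replaced by a warning plus an empty result, matching the other parsers. The normalization step then strips hyphens before hitting the openlibrary API. A standalone sketch (the ISBN and URL below are made up for illustration):

```python
import re

ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:"
ISBN_REGEX = re.compile(
    r"(97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|[-0-9X]{10,16})", re.I | re.M | re.S
)

string = "https://openlibrary.org/isbn/978-1-4920-5728-1"  # hypothetical input
match = re.search(ISBN_REGEX, string)
if match is not None:
    isbn = match.group(1)                                 # "978-1-4920-5728-1"
    isbn_plain = "".join(i for i in isbn if i.isdigit())  # "9781492057281"
    print(ISBN_URL + isbn_plain + "&jscmd=data&format=json")
```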
+""" + +import logging +import re +from collections import Counter, OrderedDict +from typing import Dict + +import requests + +from cobib.database import Entry + +from .arxiv import ARXIV_REGEX, ArxivParser +from .base_parser import Parser +from .doi import DOI_REGEX, DOIParser +from .isbn import ISBN_REGEX, ISBNParser + +LOGGER = logging.getLogger(__name__) + + +class URLParser(Parser): + # pylint: disable=too-many-return-statements + """The URL Parser.""" + + name = "url" + + def parse(self, string: str) -> Dict[str, Entry]: + # pdoc will inherit the docstring from the base class + # noqa: D102 + + if re.search(ARXIV_REGEX, string): + LOGGER.debug("URL contains an arXiv ID") + entries = ArxivParser().parse(string) + if entries: + LOGGER.debug("Successfully extracted metadata from URL with ArxivParser") + return entries + if re.search(DOI_REGEX, string): + LOGGER.debug("URL contains a DOI") + entries = DOIParser().parse(string) + if entries: + LOGGER.debug("Successfully extracted metadata from URL with DOIParser") + return entries + if re.search(ISBN_REGEX, string): + LOGGER.debug("URL contains an ISBN") + entries = ISBNParser().parse(string) + if entries: + LOGGER.debug("Successfully extracted metadata from URL with ISBNParser") + return entries + + try: + page = requests.get(string, timeout=10) + except requests.exceptions.RequestException as err: + LOGGER.error("An Exception occurred while trying to query the URL: %s.", string) + LOGGER.error(err) + return OrderedDict() + + LOGGER.debug("Falling back to determining most common DOI in URLs page contents") + matches = re.findall(DOI_REGEX, page.text) + dois = Counter(matches) + if not dois: + LOGGER.error("Could not find any DOIs on the URLs page: %s", string) + return OrderedDict() + # we assume the most common DOI on the page is the one which we are looking for + most_common_doi = dois.most_common(1)[0] + LOGGER.debug("Most common DOI is: %s", most_common_doi) + if most_common_doi[1] > 1: + entries = DOIParser().parse(most_common_doi[0]) + + if entries: + LOGGER.debug("Successfully extracted metadata from most common DOI") + return entries + + LOGGER.error("Could not extract metadata from URL: %s", string) + return OrderedDict() + + def dump(self, entry: Entry) -> None: + """We cannot dump a generic entry as a URL.""" + LOGGER.error("Cannot dump an entry as a URL.") diff --git a/tests/parsers/test_arxiv.py b/tests/parsers/test_arxiv.py index 00e2195ec8445e2e7bb6259cc65877c208739a7d..d914a7414b0be9fce6717ff6cc73fea0e8ce66d2 100644 --- a/tests/parsers/test_arxiv.py +++ b/tests/parsers/test_arxiv.py @@ -12,16 +12,41 @@ from cobib.database import Entry from .parser_test import ParserTest +def assert_default_test_entry(entry: Entry) -> None: + """Asserts that the passed entry is the default testing entry. + + Args: + entry: the entry to assert. + """ + entry.escape_special_chars() + assert entry.label == "Cao2018" + assert entry.data["archivePrefix"] == "arXiv" + assert entry.data["arxivid"].startswith("1812.09976") + assert ( + entry.data["author"] + == "Yudong Cao and Jonathan Romero and Jonathan P. Olson and Matthias Degroote and " + + "Peter D. Johnson and M{\\'a}ria Kieferov{\\'a} and Ian D. Kivlichan and Tim Menke " + + "and Borja Peropadre and Nicolas P. D. 
diff --git a/tests/parsers/test_arxiv.py b/tests/parsers/test_arxiv.py
index 00e2195ec8445e2e7bb6259cc65877c208739a7d..d914a7414b0be9fce6717ff6cc73fea0e8ce66d2 100644
--- a/tests/parsers/test_arxiv.py
+++ b/tests/parsers/test_arxiv.py
@@ -12,16 +12,41 @@ from cobib.database import Entry
 
 from .parser_test import ParserTest
 
 
+def assert_default_test_entry(entry: Entry) -> None:
+    """Asserts that the passed entry is the default testing entry.
+
+    Args:
+        entry: the entry to assert.
+    """
+    entry.escape_special_chars()
+    assert entry.label == "Cao2018"
+    assert entry.data["archivePrefix"] == "arXiv"
+    assert entry.data["arxivid"].startswith("1812.09976")
+    assert (
+        entry.data["author"]
+        == "Yudong Cao and Jonathan Romero and Jonathan P. Olson and Matthias Degroote and "
+        + "Peter D. Johnson and M{\\'a}ria Kieferov{\\'a} and Ian D. Kivlichan and Tim Menke "
+        + "and Borja Peropadre and Nicolas P. D. Sawaya and Sukin Sim and Libor Veis and "
+        + "Al{\\'a}n Aspuru-Guzik"
+    )
+    assert entry.data["doi"].startswith("10.1021/acs.chemrev.8b00803")
+    assert entry.data["title"] == "Quantum Chemistry in the Age of Quantum Computing"
+    assert entry.data["year"] == 2018
+    assert entry.data["_download"] == "http://arxiv.org/pdf/1812.09976v2"
+
+
 class TestArxivParser(ParserTest):
     """Tests for coBib's ArxivParser."""
 
-    def test_from_arxiv(self, caplog: pytest.LogCaptureFixture) -> None:
+    @pytest.mark.parametrize("query", ["1812.09976", "https://arxiv.org/abs/1812.09976"])
+    def test_from_arxiv(self, query: str, caplog: pytest.LogCaptureFixture) -> None:
         """Test parsing from arXiv.
 
         Args:
+            query: the arXiv ID or URL which to query.
             caplog: the built-in pytest fixture.
         """
-        entries = parsers.ArxivParser().parse("1812.09976")
+        entries = parsers.ArxivParser().parse(query)
 
         if (
             "cobib.parsers.arxiv",
@@ -31,21 +56,7 @@ class TestArxivParser(ParserTest):
             pytest.skip("The requests API encountered an error. Skipping test.")
 
         entry = list(entries.values())[0]
-        entry.escape_special_chars()
-        assert entry.label == "Cao2018"
-        assert entry.data["archivePrefix"] == "arXiv"
-        assert entry.data["arxivid"].startswith("1812.09976")
-        assert (
-            entry.data["author"]
-            == "Yudong Cao and Jonathan Romero and Jonathan P. Olson and Matthias Degroote and "
-            + "Peter D. Johnson and M{\\'a}ria Kieferov{\\'a} and Ian D. Kivlichan and Tim Menke "
-            + "and Borja Peropadre and Nicolas P. D. Sawaya and Sukin Sim and Libor Veis and "
-            + "Al{\\'a}n Aspuru-Guzik"
-        )
-        assert entry.data["doi"].startswith("10.1021/acs.chemrev.8b00803")
-        assert entry.data["title"] == "Quantum Chemistry in the Age of Quantum Computing"
-        assert entry.data["year"] == 2018
-        assert entry.data["_download"] == "http://arxiv.org/pdf/1812.09976v2"
+        assert_default_test_entry(entry)
 
     # regression test for https://gitlab.com/mrossinek/cobib/-/issues/57
     def test_invalid_arxiv_id(self) -> None:
@@ -83,12 +94,12 @@ class TestArxivParser(ParserTest):
             raise requests.exceptions.RequestException()
 
         monkeypatch.setattr(requests, "get", raise_exception)
-        parsers.ArxivParser().parse("dummy")
+        parsers.ArxivParser().parse("1812.0997")
 
         assert (
             "cobib.parsers.arxiv",
             logging.ERROR,
-            "An Exception occurred while trying to query the arXiv ID: dummy.",
+            "An Exception occurred while trying to query the arXiv ID: 1812.0997.",
         ) in caplog.record_tuples
 
     def test_dump(self, caplog: pytest.LogCaptureFixture) -> None:
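The pattern used here (and mirrored in test_doi.py below) is to hoist the assertions out of the test method into a module-level helper so that test_url.py can import and reuse them, while `pytest.mark.parametrize` runs the same test body once per input form. A minimal generic sketch of that structure (the names are illustrative, not coBib's):

```python
import pytest


def assert_expected(value: int) -> None:
    """A module-level assertion helper which other test modules can import."""
    assert value == 4


@pytest.mark.parametrize("query", ["4", "0b100"])  # two spellings of the same value
def test_parse(query: str) -> None:
    assert_expected(int(query, 0))
```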
+ reference["author"] = str(reference["author"]).replace("'a", "'{a}") + reference["_download"] = "https://pubs.acs.org/doi/10.1021/acs.chemrev.8b00803" + assert entry.data == reference + + class TestDOIParser(ParserTest): """Tests for coBib's DOIParser.""" - def test_from_doi(self, caplog: pytest.LogCaptureFixture) -> None: + @pytest.mark.parametrize( + "query", ["10.1021/acs.chemrev.8b00803", "https://doi.org/10.1021/acs.chemrev.8b00803"] + ) + def test_from_doi(self, query: str, caplog: pytest.LogCaptureFixture) -> None: """Test parsing from DOI. Args: + query: the arXiv ID or URL which to query. caplog: the built-in pytest fixture. """ - reference = self.EXAMPLE_ENTRY_DICT.copy() - # In this specific case the bib file provided by this DOI includes additional (yet - # unnecessary) brackets in the escaped special characters of the author field. Thus, we - # correct for this inconsistency manually before asserting the equality. - reference["author"] = str(reference["author"]).replace("'a", "'{a}") - reference["_download"] = "https://pubs.acs.org/doi/10.1021/acs.chemrev.8b00803" - entries = parsers.DOIParser().parse("10.1021/acs.chemrev.8b00803") + entries = parsers.DOIParser().parse(query) if ( "cobib.parsers.doi", @@ -37,7 +50,7 @@ class TestDOIParser(ParserTest): pytest.skip("The requests API encountered an error. Skipping test.") entry = list(entries.values())[0] - assert entry.data == reference + assert_default_test_entry(entry) def test_invalid_doi(self, caplog: pytest.LogCaptureFixture) -> None: """Test parsing an invalid DOI. diff --git a/tests/parsers/test_url.py b/tests/parsers/test_url.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb394db86e5cfedbfe593ba816b9b18d04c6c24 --- /dev/null +++ b/tests/parsers/test_url.py @@ -0,0 +1,88 @@ +"""Tests for coBib's URLParser.""" +# pylint: disable=no-self-use,unused-argument + +import logging +from typing import Callable + +import pytest + +from cobib import parsers +from cobib.database import Entry + +from .parser_test import ParserTest +from .test_arxiv import assert_default_test_entry as assert_arxiv_entry +from .test_doi import assert_default_test_entry as assert_doi_entry + + +def assert_default_test_entry(entry: Entry) -> None: + """Asserts that the passed entry is the default testing entry. + + Args: + entry: the entry to assert. + """ + entry.escape_special_chars() + assert entry.label == "Grimsley_2019" + assert entry.data["doi"] == "10.1038/s41467-019-10988-2" + assert entry.data["url"] == ["https://doi.org/10.1038%2Fs41467-019-10988-2"] + assert entry.data["year"] == 2019 + assert entry.data["month"] == "jul" + assert entry.data["publisher"] == "Springer Science and Business Media {LLC}" + assert entry.data["volume"] == 10 + assert entry.data["number"] == 1 + assert ( + entry.data["author"] + == "Harper R. Grimsley and Sophia E. Economou and Edwin Barnes and Nicholas J. 
Mayhall" + ) + assert ( + entry.data["title"] + == "An adaptive variational algorithm for exact molecular simulations on a quantum computer" + ) + assert entry.data["journal"] == "Nature Communications" + + +class TestURLParser(ParserTest): + """Tests for coBib's URLParser.""" + + @pytest.mark.parametrize( + ("query", "assertion"), + [ + ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry), + ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry), + ("https://www.nature.com/articles/s41467-019-10988-2", assert_default_test_entry), + ], + ) + def test_from_url( + self, query: str, assertion: Callable[[Entry], None], caplog: pytest.LogCaptureFixture + ) -> None: + """Test parsing from arXiv. + + Args: + query: the URL which to query. + assertion: the assertion method to run. + caplog: the built-in pytest fixture. + """ + entries = parsers.URLParser().parse(query) + + entry = list(entries.values())[0] + assertion(entry) + + def test_invalid_url(self) -> None: + """Test parsing an invalid URL.""" + entries = parsers.URLParser().parse("https://github.com/") + assert not entries + assert entries == {} + + def test_dump(self, caplog: pytest.LogCaptureFixture) -> None: + """Test dumping. + + Args: + caplog: the built-in pytest fixture. + """ + entry = Entry("dummy", {"ENTRYTYPE": "unpublished"}) + parsers.URLParser().dump(entry) + + assert ( + "cobib.parsers.url", + logging.ERROR, + "Cannot dump an entry as a URL.", + ) in caplog.record_tuples