From d476f315904b814d3aa2523f590da8cc17240c17 Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sat, 25 Sep 2021 12:07:46 +0200
Subject: [PATCH 01/11] Support full arXiv URLs in ArxivParser

---
 src/cobib/parsers/arxiv.py  | 17 ++++++++++++++---
 tests/parsers/test_arxiv.py | 12 ++++++++----
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/cobib/parsers/arxiv.py b/src/cobib/parsers/arxiv.py
index a0fa4da3..e37d5169 100644
--- a/src/cobib/parsers/arxiv.py
+++ b/src/cobib/parsers/arxiv.py
@@ -35,15 +35,26 @@ class ArxivParser(Parser):
 
     ARXIV_URL = "https://export.arxiv.org/api/query?id_list="
     """arXiv exporting URL taken from [here](https://arxiv.org/help/oa)."""
+    ARXIV_REGEX = r'(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?'
+    """A regex pattern used to match valid arXiv IDs."""
 
     def parse(self, string: str) -> Dict[str, Entry]:
         # pdoc will inherit the docstring from the base class
         # noqa: D102
-        LOGGER.info("Gathering BibTex data for arXiv ID: %s.", string)
         try:
-            page = requests.get(self.ARXIV_URL + string, timeout=10)
+            match = re.search(self.ARXIV_REGEX, string)
+            if match is None:
+                raise AssertionError
+        except AssertionError:
+            msg = f"'{string}' is not a valid arXiv ID."
+            LOGGER.warning(msg)
+            return OrderedDict()
+        arxiv_id = match.group(1)
+        LOGGER.info("Gathering BibTex data for arXiv ID: %s.", arxiv_id)
+        try:
+            page = requests.get(self.ARXIV_URL + arxiv_id, timeout=10)
         except requests.exceptions.RequestException as err:
-            LOGGER.error("An Exception occurred while trying to query the arXiv ID: %s.", string)
+            LOGGER.error("An Exception occurred while trying to query the arXiv ID: %s.", arxiv_id)
             LOGGER.error(err)
             return OrderedDict()
         xml = BeautifulSoup(page.text, features="html.parser")
diff --git a/tests/parsers/test_arxiv.py b/tests/parsers/test_arxiv.py
index 00e2195e..8c77ea3a 100644
--- a/tests/parsers/test_arxiv.py
+++ b/tests/parsers/test_arxiv.py
@@ -15,13 +15,17 @@ from .parser_test import ParserTest
 class TestArxivParser(ParserTest):
     """Tests for coBib's ArxivParser."""
 
-    def test_from_arxiv(self, caplog: pytest.LogCaptureFixture) -> None:
+    @pytest.mark.parametrize(
+        "query", ["1812.09976", "https://arxiv.org/abs/1812.09976"]
+    )
+    def test_from_arxiv(self, query: str, caplog: pytest.LogCaptureFixture) -> None:
         """Test parsing from arXiv.
 
         Args:
+            query: the arXiv ID or URL which to query.
             caplog: the built-in pytest fixture.
""" - entries = parsers.ArxivParser().parse("1812.09976") + entries = parsers.ArxivParser().parse(query) if ( "cobib.parsers.arxiv", @@ -83,12 +87,12 @@ class TestArxivParser(ParserTest): raise requests.exceptions.RequestException() monkeypatch.setattr(requests, "get", raise_exception) - parsers.ArxivParser().parse("dummy") + parsers.ArxivParser().parse("1812.0997") assert ( "cobib.parsers.arxiv", logging.ERROR, - "An Exception occurred while trying to query the arXiv ID: dummy.", + "An Exception occurred while trying to query the arXiv ID: 1812.0997.", ) in caplog.record_tuples def test_dump(self, caplog: pytest.LogCaptureFixture) -> None: -- GitLab From 9bdab820da973e27c34e1129af701e1f27f03a0d Mon Sep 17 00:00:00 2001 From: Max Rossmannek Date: Sat, 25 Sep 2021 12:14:41 +0200 Subject: [PATCH 02/11] Support full DOI URLs in DOIParser --- src/cobib/parsers/doi.py | 13 ++++++++----- tests/parsers/test_doi.py | 8 ++++++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/cobib/parsers/doi.py b/src/cobib/parsers/doi.py index a6846507..3fb179cb 100644 --- a/src/cobib/parsers/doi.py +++ b/src/cobib/parsers/doi.py @@ -51,18 +51,21 @@ class DOIParser(Parser): # pdoc will inherit the docstring from the base class # noqa: D102 try: - assert re.match(self.DOI_REGEX, string) + match = re.search(self.DOI_REGEX, string) + if match is None: + raise AssertionError except AssertionError: msg = f"'{string}' is not a valid DOI." LOGGER.warning(msg) return OrderedDict() - LOGGER.info("Gathering BibTex data for DOI: %s.", string) + doi = match.group(1) + LOGGER.info("Gathering BibTex data for DOI: %s.", doi) try: - page = requests.get(self.DOI_URL + string, headers=self.DOI_HEADER, timeout=10) + page = requests.get(self.DOI_URL + doi, headers=self.DOI_HEADER, timeout=10) # this assumes that the doi.org page redirects to the correct journal's landing page - redirected_url = requests.head(self.DOI_URL + string, timeout=1).headers["Location"] + redirected_url = requests.head(self.DOI_URL + doi, timeout=1).headers["Location"] except requests.exceptions.RequestException as err: - LOGGER.error("An Exception occurred while trying to query the DOI: %s.", string) + LOGGER.error("An Exception occurred while trying to query the DOI: %s.", doi) LOGGER.error(err) return OrderedDict() bib = BibtexParser().parse(page.text) diff --git a/tests/parsers/test_doi.py b/tests/parsers/test_doi.py index 7bea1c55..a6c8f49d 100644 --- a/tests/parsers/test_doi.py +++ b/tests/parsers/test_doi.py @@ -15,10 +15,14 @@ from .parser_test import ParserTest class TestDOIParser(ParserTest): """Tests for coBib's DOIParser.""" - def test_from_doi(self, caplog: pytest.LogCaptureFixture) -> None: + @pytest.mark.parametrize( + "query", ["10.1021/acs.chemrev.8b00803", "https://doi.org/10.1021/acs.chemrev.8b00803"] + ) + def test_from_doi(self, query: str, caplog: pytest.LogCaptureFixture) -> None: """Test parsing from DOI. Args: + query: the arXiv ID or URL which to query. caplog: the built-in pytest fixture. """ reference = self.EXAMPLE_ENTRY_DICT.copy() @@ -27,7 +31,7 @@ class TestDOIParser(ParserTest): # correct for this inconsistency manually before asserting the equality. 
reference["author"] = str(reference["author"]).replace("'a", "'{a}") reference["_download"] = "https://pubs.acs.org/doi/10.1021/acs.chemrev.8b00803" - entries = parsers.DOIParser().parse("10.1021/acs.chemrev.8b00803") + entries = parsers.DOIParser().parse(query) if ( "cobib.parsers.doi", -- GitLab From f7b7aa0d05a338bcb5f41f1071b338a2659cbc06 Mon Sep 17 00:00:00 2001 From: Max Rossmannek Date: Sat, 25 Sep 2021 12:27:15 +0200 Subject: [PATCH 03/11] Extract general regexes outside of parser classes --- src/cobib/parsers/arxiv.py | 14 +++++++------- src/cobib/parsers/doi.py | 20 ++++++++++---------- src/cobib/parsers/isbn.py | 19 +++++++++---------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/cobib/parsers/arxiv.py b/src/cobib/parsers/arxiv.py index e37d5169..6615c1b8 100644 --- a/src/cobib/parsers/arxiv.py +++ b/src/cobib/parsers/arxiv.py @@ -27,22 +27,22 @@ from .base_parser import Parser LOGGER = logging.getLogger(__name__) +ARXIV_URL = "https://export.arxiv.org/api/query?id_list=" +"""arXiv exporting URL taken from [here](https://arxiv.org/help/oa).""" +ARXIV_REGEX = r'(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?' +"""A regex pattern used to match valid DOIs.""" + class ArxivParser(Parser): """The arXiv Parser.""" name = "arxiv" - ARXIV_URL = "https://export.arxiv.org/api/query?id_list=" - """arXiv exporting URL taken from [here](https://arxiv.org/help/oa).""" - ARXIV_REGEX = r'(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?' - """A regex pattern used to match valid DOIs.""" - def parse(self, string: str) -> Dict[str, Entry]: # pdoc will inherit the docstring from the base class # noqa: D102 try: - match = re.search(self.ARXIV_REGEX, string) + match = re.search(ARXIV_REGEX, string) if match is None: raise AssertionError except AssertionError: @@ -52,7 +52,7 @@ class ArxivParser(Parser): arxiv_id = match.group(1) LOGGER.info("Gathering BibTex data for arXiv ID: %s.", arxiv_id) try: - page = requests.get(self.ARXIV_URL + arxiv_id, timeout=10) + page = requests.get(ARXIV_URL + arxiv_id, timeout=10) except requests.exceptions.RequestException as err: LOGGER.error("An Exception occurred while trying to query the arXiv ID: %s.", arxiv_id) LOGGER.error(err) diff --git a/src/cobib/parsers/doi.py b/src/cobib/parsers/doi.py index 3fb179cb..f700fcc3 100644 --- a/src/cobib/parsers/doi.py +++ b/src/cobib/parsers/doi.py @@ -34,24 +34,24 @@ from .bibtex import BibtexParser LOGGER = logging.getLogger(__name__) +DOI_URL = "https://doi.org/" +"""The DOI 'API' URL.""" +DOI_HEADER = {"Accept": "application/x-bibtex"} +"""The DOI 'API' header taken from [here](https://crosscite.org/docs.html).""" +DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b' +"""A regex pattern used to match valid DOIs.""" + class DOIParser(Parser): """The DOI Parser.""" name = "doi" - DOI_URL = "https://doi.org/" - """The DOI 'API' URL.""" - DOI_HEADER = {"Accept": "application/x-bibtex"} - """The DOI 'API' header taken from [here](https://crosscite.org/docs.html).""" - DOI_REGEX = r'(10\.[0-9a-zA-Z]+\/(?:(?!["&\'])\S)+)\b' - """A regex pattern used to match valid DOIs.""" - def parse(self, string: str) -> Dict[str, Entry]: # pdoc will inherit the docstring from the base class # noqa: D102 try: - match = re.search(self.DOI_REGEX, string) + match = re.search(DOI_REGEX, string) if match is None: raise AssertionError except AssertionError: @@ -61,9 +61,9 @@ class DOIParser(Parser): doi = match.group(1) LOGGER.info("Gathering BibTex data for DOI: %s.", doi) try: - page = requests.get(self.DOI_URL + 
+            page = requests.get(DOI_URL + doi, headers=DOI_HEADER, timeout=10)
             # this assumes that the doi.org page redirects to the correct journal's landing page
-            redirected_url = requests.head(self.DOI_URL + doi, timeout=1).headers["Location"]
+            redirected_url = requests.head(DOI_URL + doi, timeout=1).headers["Location"]
         except requests.exceptions.RequestException as err:
             LOGGER.error("An Exception occurred while trying to query the DOI: %s.", doi)
             LOGGER.error(err)
             return OrderedDict()
diff --git a/src/cobib/parsers/isbn.py b/src/cobib/parsers/isbn.py
index 27138438..eb1a64e2 100644
--- a/src/cobib/parsers/isbn.py
+++ b/src/cobib/parsers/isbn.py
@@ -28,28 +28,27 @@ from .base_parser import Parser
 
 LOGGER = logging.getLogger(__name__)
 
+ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:"
+"""ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books)."""
+ISBN_REGEX = re.compile(
+    r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|" r"[-0-9X]{10,16}", re.I | re.M | re.S
+)
+"""A regex pattern used to match valid ISBNs. Adapted from [here](https://github.com/xlcnd/isbnlib)."""
+
 
 class ISBNParser(Parser):
     """The ISBN Parser."""
 
     name = "isbn"
 
-    ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:"
-    """ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books)."""
-    ISBN_REGEX = re.compile(
-        r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|" r"[-0-9X]{10,16}", re.I | re.M | re.S
-    )
-    """A regex pattern used to match valid ISBNs. Adapted from
-    [here](https://github.com/xlcnd/isbnlib)."""
-
     def parse(self, string: str) -> Dict[str, Entry]:
         # pdoc will inherit the docstring from the base class
         # noqa: D102
-        assert re.match(self.ISBN_REGEX, string)
+        assert re.match(ISBN_REGEX, string)
         LOGGER.info("Gathering BibTex data for ISBN: %s.", string)
         isbn_plain = "".join([i for i in string if i.isdigit()])
         try:
-            page = requests.get(self.ISBN_URL + isbn_plain + "&jscmd=data&format=json", timeout=10)
+            page = requests.get(ISBN_URL + isbn_plain + "&jscmd=data&format=json", timeout=10)
         except requests.exceptions.RequestException as err:
             LOGGER.error("An Exception occurred while trying to query the ISBN: %s.", string)
             LOGGER.error(err)
             return OrderedDict()
-- 
GitLab

From b0924a9519fb90e3fb82e8516e4cf67f224bbe91 Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sat, 25 Sep 2021 12:36:55 +0200
Subject: [PATCH 04/11] Black formatting

---
 src/cobib/parsers/arxiv.py  | 2 +-
 src/cobib/parsers/isbn.py   | 7 +++----
 tests/parsers/test_arxiv.py | 4 +---
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/cobib/parsers/arxiv.py b/src/cobib/parsers/arxiv.py
index 6615c1b8..5bf25e1b 100644
--- a/src/cobib/parsers/arxiv.py
+++ b/src/cobib/parsers/arxiv.py
@@ -29,7 +29,7 @@ LOGGER = logging.getLogger(__name__)
 
 ARXIV_URL = "https://export.arxiv.org/api/query?id_list="
 """arXiv exporting URL taken from [here](https://arxiv.org/help/oa)."""
-ARXIV_REGEX = r'(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?'
+ARXIV_REGEX = r"(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?\/\d{7})(v\d+)?"
"""A regex pattern used to match valid DOIs.""" diff --git a/src/cobib/parsers/isbn.py b/src/cobib/parsers/isbn.py index eb1a64e2..6a2b9c2c 100644 --- a/src/cobib/parsers/isbn.py +++ b/src/cobib/parsers/isbn.py @@ -30,10 +30,9 @@ LOGGER = logging.getLogger(__name__) ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:" """ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books).""" -ISBN_REGEX = re.compile( - r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|" r"[-0-9X]{10,16}", re.I | re.M | re.S -) -"""A regex pattern used to match valid ISBNs. Adapted from [here](https://github.com/xlcnd/isbnlib).""" +ISBN_REGEX = re.compile(r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|[-0-9X]{10,16}", re.I | re.M | re.S) +"""A regex pattern used to match valid ISBNs. Adapted from +[here](https://github.com/xlcnd/isbnlib).""" class ISBNParser(Parser): diff --git a/tests/parsers/test_arxiv.py b/tests/parsers/test_arxiv.py index 8c77ea3a..5e937922 100644 --- a/tests/parsers/test_arxiv.py +++ b/tests/parsers/test_arxiv.py @@ -15,9 +15,7 @@ from .parser_test import ParserTest class TestArxivParser(ParserTest): """Tests for coBib's ArxivParser.""" - @pytest.mark.parametrize( - "query", ["1812.09976", "https://arxiv.org/abs/1812.09976"] - ) + @pytest.mark.parametrize("query", ["1812.09976", "https://arxiv.org/abs/1812.09976"]) def test_from_arxiv(self, query: str, caplog: pytest.LogCaptureFixture) -> None: """Test parsing from arXiv. -- GitLab From 09c46d3b47a19697e039aaf5c4b8a27b0fb2875a Mon Sep 17 00:00:00 2001 From: Max Rossmannek Date: Sat, 25 Sep 2021 12:39:37 +0200 Subject: [PATCH 05/11] Apply same regex search to ISBN queries (although ISBN-URLs are unlikely?) --- src/cobib/parsers/isbn.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/cobib/parsers/isbn.py b/src/cobib/parsers/isbn.py index 6a2b9c2c..e7e466b4 100644 --- a/src/cobib/parsers/isbn.py +++ b/src/cobib/parsers/isbn.py @@ -30,7 +30,9 @@ LOGGER = logging.getLogger(__name__) ISBN_URL = "https://openlibrary.org/api/books?bibkeys=ISBN:" """ISBN API URL taken from [here](https://openlibrary.org/dev/docs/api/books).""" -ISBN_REGEX = re.compile(r"97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|[-0-9X]{10,16}", re.I | re.M | re.S) +ISBN_REGEX = re.compile( + r"(97[89]{1}(?:-?\d){10}|\d{9}[0-9X]{1}|[-0-9X]{10,16})", re.I | re.M | re.S +) """A regex pattern used to match valid ISBNs. Adapted from [here](https://github.com/xlcnd/isbnlib).""" @@ -43,13 +45,21 @@ class ISBNParser(Parser): def parse(self, string: str) -> Dict[str, Entry]: # pdoc will inherit the docstring from the base class # noqa: D102 - assert re.match(ISBN_REGEX, string) - LOGGER.info("Gathering BibTex data for ISBN: %s.", string) - isbn_plain = "".join([i for i in string if i.isdigit()]) + try: + match = re.search(ISBN_REGEX, string) + if match is None: + raise AssertionError + except AssertionError: + msg = f"'{string}' is not a valid ISBN." 
+            LOGGER.warning(msg)
+            return OrderedDict()
+        isbn = match.group(1)
+        LOGGER.info("Gathering BibTex data for ISBN: %s.", isbn)
+        isbn_plain = "".join([i for i in isbn if i.isdigit()])
         try:
             page = requests.get(ISBN_URL + isbn_plain + "&jscmd=data&format=json", timeout=10)
         except requests.exceptions.RequestException as err:
-            LOGGER.error("An Exception occurred while trying to query the ISBN: %s.", string)
+            LOGGER.error("An Exception occurred while trying to query the ISBN: %s.", isbn)
             LOGGER.error(err)
             return OrderedDict()
         try:
@@ -60,7 +70,7 @@ class ISBNParser(Parser):
             return OrderedDict()
         if not contents:
             msg = (
-                f'No data was found for ISBN "{string}". If you think this is an error and '
+                f'No data was found for ISBN "{isbn}". If you think this is an error and '
                 + "the openlibrary API should provide an entry, please file a bug report. "
                 + "Otherwise please try adding this entry manually until more APIs are "
                 + "available in coBib."
-- 
GitLab

From 58d2dde6ed0cabeaabfe28f65b18df689093098c Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sat, 25 Sep 2021 13:01:50 +0200
Subject: [PATCH 06/11] Add crude URLParser

---
 src/cobib/commands/add.py     | 12 +++++--
 src/cobib/parsers/__init__.py |  1 +
 src/cobib/parsers/url.py      | 43 ++++++++++++++++++++++++
 tests/parsers/test_arxiv.py   | 39 +++++++++++++---------
 tests/parsers/test_doi.py     | 23 +++++++++----
 tests/parsers/test_url.py     | 61 +++++++++++++++++++++++++++++++++++
 6 files changed, 154 insertions(+), 25 deletions(-)
 create mode 100644 src/cobib/parsers/url.py
 create mode 100644 tests/parsers/test_url.py

diff --git a/src/cobib/commands/add.py b/src/cobib/commands/add.py
index 8664a91c..55337fd8 100644
--- a/src/cobib/commands/add.py
+++ b/src/cobib/commands/add.py
@@ -170,9 +170,15 @@ class AddCommand(Command):
             cls.name: cls for _, cls in inspect.getmembers(parsers) if inspect.isclass(cls)
         }
         for name in avail_parsers.keys():
-            group_add.add_argument(
-                f"-{name[0]}", f"--{name}", type=str, help=f"{name} object identfier"
-            )
+            try:
+                group_add.add_argument(
+                    f"-{name[0]}", f"--{name}", type=str, help=f"{name} object identfier"
+                )
+            except argparse.ArgumentError:
+                try:
+                    group_add.add_argument(f"--{name}", type=str, help=f"{name} object identfier")
+                except argparse.ArgumentError:
+                    continue
         parser.add_argument(
             "tags",
             nargs=argparse.REMAINDER,
diff --git a/src/cobib/parsers/__init__.py b/src/cobib/parsers/__init__.py
index f99cae07..f232c50b 100644
--- a/src/cobib/parsers/__init__.py
+++ b/src/cobib/parsers/__init__.py
@@ -9,4 +9,5 @@ from .arxiv import ArxivParser as ArxivParser
 from .bibtex import BibtexParser as BibtexParser
 from .doi import DOIParser as DOIParser
 from .isbn import ISBNParser as ISBNParser
+from .url import URLParser as URLParser
 from .yaml import YAMLParser as YAMLParser
diff --git a/src/cobib/parsers/url.py b/src/cobib/parsers/url.py
new file mode 100644
index 00000000..54f8ab11
--- /dev/null
+++ b/src/cobib/parsers/url.py
@@ -0,0 +1,43 @@
+"""coBib's URL parser.
+
+Todo:
+    - Documentation
+"""
+
+import logging
+import re
+from collections import OrderedDict
+from typing import Dict
+
+from cobib.database import Entry
+
+from .arxiv import ARXIV_REGEX, ArxivParser
+from .base_parser import Parser
+from .doi import DOI_REGEX, DOIParser
+from .isbn import ISBN_REGEX, ISBNParser
+
+LOGGER = logging.getLogger(__name__)
+
+
+class URLParser(Parser):
+    """The URL Parser."""
+
+    name = "url"
+
+    def parse(self, string: str) -> Dict[str, Entry]:
+        # pdoc will inherit the docstring from the base class
+        # noqa: D102
+
+        if re.search(ARXIV_REGEX, string):
+            return ArxivParser().parse(string)
+        if re.search(ISBN_REGEX, string):
+            return ISBNParser().parse(string)
+        if re.search(DOI_REGEX, string):
+            return DOIParser().parse(string)
+
+        LOGGER.error("Could not identify type of the URL: %s", string)
+        return OrderedDict()
+
+    def dump(self, entry: Entry) -> None:
+        """We cannot dump a generic entry as a URL."""
+        LOGGER.error("Cannot dump an entry as a URL.")
diff --git a/tests/parsers/test_arxiv.py b/tests/parsers/test_arxiv.py
index 5e937922..d914a741 100644
--- a/tests/parsers/test_arxiv.py
+++ b/tests/parsers/test_arxiv.py
@@ -12,6 +12,29 @@ from cobib.database import Entry
 from .parser_test import ParserTest
 
 
+def assert_default_test_entry(entry: Entry) -> None:
+    """Asserts that the passed entry is the default testing entry.
+
+    Args:
+        entry: the entry to assert.
+    """
+    entry.escape_special_chars()
+    assert entry.label == "Cao2018"
+    assert entry.data["archivePrefix"] == "arXiv"
+    assert entry.data["arxivid"].startswith("1812.09976")
+    assert (
+        entry.data["author"]
+        == "Yudong Cao and Jonathan Romero and Jonathan P. Olson and Matthias Degroote and "
+        + "Peter D. Johnson and M{\\'a}ria Kieferov{\\'a} and Ian D. Kivlichan and Tim Menke "
+        + "and Borja Peropadre and Nicolas P. D. Sawaya and Sukin Sim and Libor Veis and "
+        + "Al{\\'a}n Aspuru-Guzik"
+    )
+    assert entry.data["doi"].startswith("10.1021/acs.chemrev.8b00803")
+    assert entry.data["title"] == "Quantum Chemistry in the Age of Quantum Computing"
+    assert entry.data["year"] == 2018
+    assert entry.data["_download"] == "http://arxiv.org/pdf/1812.09976v2"
+
+
 class TestArxivParser(ParserTest):
     """Tests for coBib's ArxivParser."""
 
@@ -33,21 +56,7 @@ class TestArxivParser(ParserTest):
             pytest.skip("The requests API encountered an error. Skipping test.")
 
         entry = list(entries.values())[0]
-        entry.escape_special_chars()
-        assert entry.label == "Cao2018"
-        assert entry.data["archivePrefix"] == "arXiv"
-        assert entry.data["arxivid"].startswith("1812.09976")
-        assert (
-            entry.data["author"]
-            == "Yudong Cao and Jonathan Romero and Jonathan P. Olson and Matthias Degroote and "
-            + "Peter D. Johnson and M{\\'a}ria Kieferov{\\'a} and Ian D. Kivlichan and Tim Menke "
-            + "and Borja Peropadre and Nicolas P. D. Sawaya and Sukin Sim and Libor Veis and "
-            + "Al{\\'a}n Aspuru-Guzik"
-        )
-        assert entry.data["doi"].startswith("10.1021/acs.chemrev.8b00803")
-        assert entry.data["title"] == "Quantum Chemistry in the Age of Quantum Computing"
-        assert entry.data["year"] == 2018
-        assert entry.data["_download"] == "http://arxiv.org/pdf/1812.09976v2"
+        assert_default_test_entry(entry)
 
     # regression test for https://gitlab.com/mrossinek/cobib/-/issues/57
     def test_invalid_arxiv_id(self) -> None:
diff --git a/tests/parsers/test_doi.py b/tests/parsers/test_doi.py
index a6c8f49d..a1e18e4e 100644
--- a/tests/parsers/test_doi.py
+++ b/tests/parsers/test_doi.py
@@ -12,6 +12,21 @@ from cobib.database import Entry
 from .parser_test import ParserTest
 
 
+def assert_default_test_entry(entry: Entry) -> None:
+    """Asserts that the passed entry is the default testing entry.
+
+    Args:
+        entry: the entry to assert.
+    """
+    reference = ParserTest.EXAMPLE_ENTRY_DICT.copy()
+    # In this specific case the bib file provided by this DOI includes additional (yet
+    # unnecessary) brackets in the escaped special characters of the author field. Thus, we
+    # correct for this inconsistency manually before asserting the equality.
+    reference["author"] = str(reference["author"]).replace("'a", "'{a}")
+    reference["_download"] = "https://pubs.acs.org/doi/10.1021/acs.chemrev.8b00803"
+    assert entry.data == reference
+
+
 class TestDOIParser(ParserTest):
     """Tests for coBib's DOIParser."""
 
@@ -25,12 +40,6 @@ class TestDOIParser(ParserTest):
             query: the DOI or URL which to query.
             caplog: the built-in pytest fixture.
         """
-        reference = self.EXAMPLE_ENTRY_DICT.copy()
-        # In this specific case the bib file provided by this DOI includes additional (yet
-        # unnecessary) brackets in the escaped special characters of the author field. Thus, we
-        # correct for this inconsistency manually before asserting the equality.
-        reference["author"] = str(reference["author"]).replace("'a", "'{a}")
-        reference["_download"] = "https://pubs.acs.org/doi/10.1021/acs.chemrev.8b00803"
         entries = parsers.DOIParser().parse(query)
 
         if (
             "cobib.parsers.doi",
@@ -41,7 +50,7 @@ class TestDOIParser(ParserTest):
             pytest.skip("The requests API encountered an error. Skipping test.")
 
         entry = list(entries.values())[0]
-        assert entry.data == reference
+        assert_default_test_entry(entry)
 
     def test_invalid_doi(self, caplog: pytest.LogCaptureFixture) -> None:
         """Test parsing an invalid DOI.
diff --git a/tests/parsers/test_url.py b/tests/parsers/test_url.py
new file mode 100644
index 00000000..964a6f1d
--- /dev/null
+++ b/tests/parsers/test_url.py
@@ -0,0 +1,61 @@
+"""Tests for coBib's URLParser."""
+# pylint: disable=no-self-use,unused-argument
+
+import logging
+from typing import Callable
+
+import pytest
+
+from cobib import parsers
+from cobib.database import Entry
+
+from .parser_test import ParserTest
+from .test_arxiv import assert_default_test_entry as assert_arxiv_entry
+from .test_doi import assert_default_test_entry as assert_doi_entry
+
+
+class TestURLParser(ParserTest):
+    """Tests for coBib's URLParser."""
+
+    @pytest.mark.parametrize(
+        ("query", "assertion"),
+        [
+            ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry),
+            ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry),
+        ],
+    )
+    def test_from_url(
+        self, query: str, assertion: Callable[[Entry], None], caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Test parsing from URL.
+
+        Args:
+            query: the URL which to query.
+            assertion: the assertion method to run.
+            caplog: the built-in pytest fixture.
+        """
+        entries = parsers.URLParser().parse(query)
+
+        entry = list(entries.values())[0]
+        assertion(entry)
+
+    def test_invalid_url(self) -> None:
+        """Test parsing an invalid URL."""
+        entries = parsers.URLParser().parse("https://github.com/")
+        assert not entries
+        assert entries == {}
+
+    def test_dump(self, caplog: pytest.LogCaptureFixture) -> None:
+        """Test dumping.
+
+        Args:
+            caplog: the built-in pytest fixture.
+        """
+        entry = Entry("dummy", {"ENTRYTYPE": "unpublished"})
+        parsers.URLParser().dump(entry)
+
+        assert (
+            "cobib.parsers.url",
+            logging.ERROR,
+            "Cannot dump an entry as a URL.",
+        ) in caplog.record_tuples
-- 
GitLab

From 8612d0d2d15b146994776d0ad64ba1a41e2037a8 Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sun, 26 Sep 2021 00:03:56 +0200
Subject: [PATCH 07/11] Fall back to most common DOI on page

---
 src/cobib/parsers/url.py  | 37 +++++++++++++++++++++++++++++++------
 tests/parsers/test_url.py | 31 +++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/src/cobib/parsers/url.py b/src/cobib/parsers/url.py
index 54f8ab11..a8a8245f 100644
--- a/src/cobib/parsers/url.py
+++ b/src/cobib/parsers/url.py
@@ -6,9 +6,11 @@
 
 import logging
 import re
-from collections import OrderedDict
+from collections import Counter, OrderedDict
 from typing import Dict
 
+import requests
+
 from cobib.database import Entry
 
 from .arxiv import ARXIV_REGEX, ArxivParser
@@ -29,13 +31,36 @@ class URLParser(Parser):
         # noqa: D102
 
         if re.search(ARXIV_REGEX, string):
-            return ArxivParser().parse(string)
-        if re.search(ISBN_REGEX, string):
-            return ISBNParser().parse(string)
+            entries = ArxivParser().parse(string)
+            if entries:
+                return entries
         if re.search(DOI_REGEX, string):
-            return DOIParser().parse(string)
+            entries = DOIParser().parse(string)
+            if entries:
+                return entries
+        if re.search(ISBN_REGEX, string):
+            entries = ISBNParser().parse(string)
+            if entries:
+                return entries
+
+        try:
+            page = requests.get(string, timeout=10)
+        except requests.exceptions.RequestException as err:
+            LOGGER.error("An Exception occurred while trying to query the URL: %s.", string)
+            LOGGER.error(err)
+            return OrderedDict()
+
+        matches = re.findall(DOI_REGEX, page.text)
+        dois = Counter(matches)
+        # we assume the most common DOI on the page is the one which we are looking for
+        most_common_doi = dois.most_common(1)[0]
+        if most_common_doi[1] > 1:
+            entries = DOIParser().parse(most_common_doi[0])
+
+        if entries:
+            return entries
 
-        LOGGER.error("Could not identify type of the URL: %s", string)
+        LOGGER.error("Could not extract metadata from URL: %s", string)
         return OrderedDict()
 
     def dump(self, entry: Entry) -> None:
diff --git a/tests/parsers/test_url.py b/tests/parsers/test_url.py
index 964a6f1d..6e00e7a5 100644
--- a/tests/parsers/test_url.py
+++ b/tests/parsers/test_url.py
@@ -14,14 +14,41 @@ from .test_arxiv import assert_default_test_entry as assert_arxiv_entry
 from .test_doi import assert_default_test_entry as assert_doi_entry
 
 
+def assert_default_test_entry(entry: Entry) -> None:
+    """Asserts that the passed entry is the default testing entry.
+
+    Args:
+        entry: the entry to assert.
+    """
+    entry.escape_special_chars()
+    assert entry.label == "Grimsley_2019"
+    assert entry.data["doi"] == "10.1038/s41467-019-10988-2"
+    assert entry.data["url"] == ["https://doi.org/10.1038%2Fs41467-019-10988-2"]
+    assert entry.data["year"] == 2019
+    assert entry.data["month"] == "jul"
+    assert entry.data["publisher"] == "Springer Science and Business Media {LLC}"
+    assert entry.data["volume"] == 10
+    assert entry.data["number"] == 1
+    assert (
+        entry.data["author"]
+        == "Harper R. Grimsley and Sophia E. Economou and Edwin Barnes and Nicholas J. Mayhall"
+    )
+    assert (
+        entry.data["title"]
+        == "An adaptive variational algorithm for exact molecular simulations on a quantum computer"
+    )
+    assert entry.data["journal"] == "Nature Communications"
+
+
 class TestURLParser(ParserTest):
     """Tests for coBib's URLParser."""
 
     @pytest.mark.parametrize(
         ("query", "assertion"),
         [
-            ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry),
-            ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry),
+            # ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry),
+            # ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry),
+            ("https://www.nature.com/articles/s41467-019-10988-2", assert_default_test_entry),
         ],
     )
     def test_from_url(
-- 
GitLab

From 397e68e753cce644aba6bc4a8ad5f925337a0519 Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sun, 26 Sep 2021 00:09:08 +0200
Subject: [PATCH 08/11] Add comments to URLParser

---
 src/cobib/parsers/url.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/cobib/parsers/url.py b/src/cobib/parsers/url.py
index a8a8245f..288d2972 100644
--- a/src/cobib/parsers/url.py
+++ b/src/cobib/parsers/url.py
@@ -31,16 +31,22 @@ class URLParser(Parser):
         # noqa: D102
 
         if re.search(ARXIV_REGEX, string):
+            LOGGER.debug("URL contains an arXiv ID")
             entries = ArxivParser().parse(string)
             if entries:
+                LOGGER.debug("Successfully extracted metadata from URL with ArxivParser")
                 return entries
         if re.search(DOI_REGEX, string):
+            LOGGER.debug("URL contains a DOI")
             entries = DOIParser().parse(string)
             if entries:
+                LOGGER.debug("Successfully extracted metadata from URL with DOIParser")
                 return entries
         if re.search(ISBN_REGEX, string):
+            LOGGER.debug("URL contains an ISBN")
             entries = ISBNParser().parse(string)
             if entries:
+                LOGGER.debug("Successfully extracted metadata from URL with ISBNParser")
                 return entries
 
         try:
@@ -50,14 +56,17 @@ class URLParser(Parser):
             LOGGER.error(err)
             return OrderedDict()
 
+        LOGGER.debug("Falling back to determining most common DOI in URLs page contents")
         matches = re.findall(DOI_REGEX, page.text)
         dois = Counter(matches)
         # we assume the most common DOI on the page is the one which we are looking for
         most_common_doi = dois.most_common(1)[0]
+        LOGGER.debug("Most common DOI is: %s", most_common_doi)
         if most_common_doi[1] > 1:
             entries = DOIParser().parse(most_common_doi[0])
 
         if entries:
+            LOGGER.debug("Successfully extracted metadata from most common DOI")
             return entries
 
         LOGGER.error("Could not extract metadata from URL: %s", string)
-- 
GitLab

From 2b98fcc208b98518e9a0698fa4cc03649b957d51 Mon Sep 17 00:00:00 2001
From: Max Rossmannek
Date: Sun, 26 Sep 2021 00:11:29 +0200
Subject: [PATCH 09/11] Re-enable falsely disabled unittests

---
 tests/parsers/test_url.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/parsers/test_url.py b/tests/parsers/test_url.py
index 6e00e7a5..6cb394db 100644
--- a/tests/parsers/test_url.py
+++ b/tests/parsers/test_url.py
@@ -46,8 +46,8 @@ class TestURLParser(ParserTest):
 
     @pytest.mark.parametrize(
("query", "assertion"), [ - # ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry), - # ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry), + ("https://arxiv.org/abs/1812.09976", assert_arxiv_entry), + ("https://doi.org/10.1021/acs.chemrev.8b00803", assert_doi_entry), ("https://www.nature.com/articles/s41467-019-10988-2", assert_default_test_entry), ], ) -- GitLab From d22ac2d9566c828d3901d0fad86f35b5313f146c Mon Sep 17 00:00:00 2001 From: Max Rossmannek Date: Sun, 26 Sep 2021 00:12:49 +0200 Subject: [PATCH 10/11] Guard against no DOIs on URL page --- src/cobib/parsers/url.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cobib/parsers/url.py b/src/cobib/parsers/url.py index 288d2972..c60569cb 100644 --- a/src/cobib/parsers/url.py +++ b/src/cobib/parsers/url.py @@ -59,6 +59,9 @@ class URLParser(Parser): LOGGER.debug("Falling back to determining most common DOI in URLs page contents") matches = re.findall(DOI_REGEX, page.text) dois = Counter(matches) + if not dois: + LOGGER.error("Could not find any DOIs on the URLs page: %s", string) + return OrderedDict() # we assume the most common DOI on the page is the one which we are looking for most_common_doi = dois.most_common(1)[0] LOGGER.debug("Most common DOI is: %s", most_common_doi) -- GitLab From fb3f1868b520bc5ed0d88fed84a8bba97273809a Mon Sep 17 00:00:00 2001 From: Max Rossmannek Date: Sun, 26 Sep 2021 00:18:17 +0200 Subject: [PATCH 11/11] Update docs --- src/cobib/commands/add.py | 2 ++ src/cobib/parsers/arxiv.py | 2 ++ src/cobib/parsers/doi.py | 2 ++ src/cobib/parsers/isbn.py | 2 ++ src/cobib/parsers/url.py | 14 ++++++++++++-- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/cobib/commands/add.py b/src/cobib/commands/add.py index 55337fd8..65e16df1 100644 --- a/src/cobib/commands/add.py +++ b/src/cobib/commands/add.py @@ -15,6 +15,7 @@ cobib add --bibtex some_biblatex_file.bib cobib add --arxiv cobib add --doi cobib add --isbn +cobib add --url cobib add --yaml some_cobib_style_yaml_file.yaml ``` @@ -127,6 +128,7 @@ class AddCommand(Command): args: a sequence of additional arguments used for the execution. The following values are allowed for this command: * `-l`, `--label`: the label to give to the new entry. + * `-u`, `--update`: updates an existing database entry if it already exists. * `-f`, `--file`: one or multiple files to associate with this entry. This data will be stored in the `cobib.database.Entry.file` property. * `-p`, `--path`: the path to store the downloaded associated file in. This can diff --git a/src/cobib/parsers/arxiv.py b/src/cobib/parsers/arxiv.py index 5bf25e1b..5606b2a8 100644 --- a/src/cobib/parsers/arxiv.py +++ b/src/cobib/parsers/arxiv.py @@ -6,6 +6,8 @@ It gathers the BibTex-encoded data from the arXiv API and parses the raw XML dat Since v3.2.0 coBib will also automatically download the PDF version of the new entry. You can configure the default download location via `config.utils.file_downloader.default_location`. +Since v3.3.0 this parser even supports URLs from which an arXiv ID can be extracted directly. + The parser is registered under the `-a` and `--arxiv` command-line arguments of the `cobib.commands.add.AddCommand`. 
diff --git a/src/cobib/parsers/doi.py b/src/cobib/parsers/doi.py
index f700fcc3..be79c45d 100644
--- a/src/cobib/parsers/doi.py
+++ b/src/cobib/parsers/doi.py
@@ -13,6 +13,8 @@ Furthermore, you should look into the `config.utils.file_downloader.url_map` set
 you tell coBib how to map from journal landing page URLs to the corresponding PDF URLs. For more
 information check out `cobib.config.example` and the man-page.
 
+Since v3.3.0 this parser even supports URLs from which a DOI can be extracted directly.
+
 The parser is registered under the `-d` and `--doi` command-line arguments of the
 `cobib.commands.add.AddCommand`.
 
diff --git a/src/cobib/parsers/isbn.py b/src/cobib/parsers/isbn.py
index e7e466b4..460ff3c5 100644
--- a/src/cobib/parsers/isbn.py
+++ b/src/cobib/parsers/isbn.py
@@ -7,6 +7,8 @@ Note, that the openlibrary API does not contain all ISBNs and potential server e
 by the parser.
 In the future, I hope to make the API backend configurable.
 
+Since v3.3.0 this parser also supports URLs from which an ISBN can be extracted directly.
+
 The parser is registered under the `-i` and `--isbn` command-line arguments of the
 `cobib.commands.add.AddCommand`.
 
diff --git a/src/cobib/parsers/url.py b/src/cobib/parsers/url.py
index c60569cb..343fdf8e 100644
--- a/src/cobib/parsers/url.py
+++ b/src/cobib/parsers/url.py
@@ -1,7 +1,16 @@
 """coBib's URL parser.
 
-Todo:
-    - Documentation
+This parser is capable of generating `cobib.database.Entry` instances from general URLs.
+It checks the URL for contained arXiv IDs, DOIs, and ISBNs, in this order.
+If none of these match (or they fail to return an `Entry`), it falls back to extracting all DOIs
+from the page which the URL points to. It then takes the most common DOI (if it occurs more than
+once) as the DOI to which this URL redirects.
+
+The parser is registered under the `--url` command-line argument of the
+`cobib.commands.add.AddCommand`.
+
+The following documentation is mostly inherited from the abstract interface
+`cobib.parsers.base_parser`.
 """
 
 import logging
@@ -22,6 +31,7 @@ LOGGER = logging.getLogger(__name__)
 
 
 class URLParser(Parser):
+    # pylint: disable=too-many-return-statements
     """The URL Parser."""
 
     name = "url"
-- 
GitLab
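
Taken together, these patches let every ID-based parser accept a full URL and introduce the generic URLParser with its most-common-DOI fallback. As a rough usage sketch (assuming coBib v3.3.0 with this series applied; the queries mirror the ones exercised in the test suite above and require network access):

    from cobib import parsers

    # The ID-based parsers now extract their identifier from full URLs:
    entries = parsers.ArxivParser().parse("https://arxiv.org/abs/1812.09976")
    entries = parsers.DOIParser().parse("https://doi.org/10.1021/acs.chemrev.8b00803")

    # The URLParser cascades through the arXiv/DOI/ISBN regexes and, failing those,
    # fetches the page and falls back to the most common DOI found in its contents:
    entries = parsers.URLParser().parse("https://www.nature.com/articles/s41467-019-10988-2")

    # Each parser returns a Dict[str, Entry] mapping labels to entries:
    for label, entry in entries.items():
        print(label, entry.data["doi"])

The same functionality is exposed on the command line through the new `--url` argument:

    cobib add --url https://www.nature.com/articles/s41467-019-10988-2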