| author | n1 <hrdina.pavel@gmail.com> | 2019-07-27 13:25:13 +0200 |
|---|---|---|
| committer | n1 <hrdina.pavel@gmail.com> | 2019-07-27 13:25:13 +0200 |
| commit | 2f2c732c35e67a1beb9ed760e95b627ebe8d63b7 | |
| tree | 8e5ce9d4b8901654e34d3cdf07e58e30f21328c5 | |
| parent | 106b58b52dc2bf34e64f87be3c54e53fecbf3f88 | |
Added: top news method. (tag: 0.2.5)
| -rw-r--r-- | README.rst | 63 |
| -rw-r--r-- | karpet/core.py | 199 |
| -rw-r--r-- | karpet/meta.py | 2 |
| -rw-r--r-- | test_karpet.py | 29 |
4 files changed, 222 insertions, 71 deletions
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -61,8 +61,8 @@ Symbol (ticker) -> coinmarketcap.com URL slug conversion.
 
 .. code-block:: python
 
-    c = Karpet()
-    c.get_coin_slug("BTC") # bitcoin
+    k = Karpet()
+    k.get_coin_slug("BTC") # bitcoin
 
 ``fetch_historical_data()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -70,8 +70,8 @@ Retrieves historical data.
 
 .. code-block:: python
 
-    c = Karpet(date(2019, 1, 1), date(2019, 5, 1))
-    df = c.fetch_crypto_historical_data(coin="bitcoin") # Dataframe with historical data.
+    k = Karpet(date(2019, 1, 1), date(2019, 5, 1))
+    df = k.fetch_crypto_historical_data(symbol="btc") # Dataframe with historical data.
     df.head()
 
 .. image:: https://raw.githubusercontent.com/im-n1/karpet/master/assets/historical_data.png
@@ -82,8 +82,8 @@ Retrieves exchange list.
 
 .. code-block:: python
 
-    c = Karpet()
-    c.fetch_exchanges("nrg")
+    k = Karpet()
+    k.fetch_exchanges("nrg")
     ['DigiFinex', 'KuCoin', 'CryptoBridge', 'Bitbns', 'CoinExchange']
 
 ``fetch_tweets()``
@@ -92,8 +92,8 @@ Retrieves Twitter tweets.
 
 .. code-block:: python
 
-    c = Karpet(date(2019, 1, 1), date(2019, 5, 1))
-    df = c.fetch_tweets(kw_list=["bitcoin"], lang="en") # Dataframe with tweets.
+    k = Karpet(date(2019, 1, 1), date(2019, 5, 1))
+    df = k.fetch_tweets(kw_list=["bitcoin"], lang="en") # Dataframe with tweets.
     df.head()
 
 .. image:: https://raw.githubusercontent.com/im-n1/karpet/master/assets/tweets.png
@@ -104,8 +104,8 @@ Retrieves Google Trends (in percent) for the given date range.
 
 .. code-block:: python
 
-    c = Karpet(date(2019, 1, 1), date(2019, 5, 1))
-    df = c.fetch_google_trends(kw_list=["bitcoin"]) # Dataframe with trends.
+    k = Karpet(date(2019, 1, 1), date(2019, 5, 1))
+    df = k.fetch_google_trends(kw_list=["bitcoin"]) # Dataframe with trends.
     df.head()
 
 .. image:: https://raw.githubusercontent.com/im-n1/karpet/master/assets/google_trends.png
@@ -125,8 +125,8 @@ Retrieves crypto news.
 
 .. code-block:: python
 
-    c = Karpet()
-    news = c.fetch_news("btc") # Gets 10 news items.
+    k = Karpet()
+    news = k.fetch_news("btc") # Gets 10 news items.
     print(news[0])
     {
         'url': 'https://cointelegraph.com/ ....', # Truncated.
@@ -135,10 +135,47 @@ Retrieves crypto news.
         'date': datetime.datetime(2019, 7, 10, 19, 0, 13),
         'image': 'https://images.cointelegraph.com/....jpg' # Truncated.
     }
-    news = c.fetch_news("btc", limit=30) # Gets 30 news items.
+    news = k.fetch_news("btc", limit=30) # Gets 30 news items.
+
+``fetch_top_news()``
+~~~~~~~~~~~~~~~~~~~~
+Retrieves top crypto news in 2 categories:
+
+* Editor's choice - articles picked by editors
+* Hot stories - articles with the most views
+
+.. code-block:: python
+
+    k = Karpet()
+    editors_choice, top_stories = k.fetch_top_news()
+    print(len(editors_choice))
+    5
+    print(len(top_stories))
+    5
+    print(editors_choice[0])
+    {
+        'url': 'https://cointelegraph.com/...', # Truncated.
+        'title': 'Bank of China’s New Infographic Shows Why Bitcoin Price Is Going Up',
+        'date': '2019-07-27T10:07:00+01:00',
+        'image': 'https://images.cointelegraph.com/images/740_aHR...', # Truncated.
+        'description': 'The Chinese central bank released on its website an ...' # Truncated.
+    }
+    print(top_stories[0])
+    {
+        'url': 'https://cointelegraph.com/...', # Truncated.
+        'title': 'Bitcoin Price Shuns Volatility as Analysts Warn of Potential Drop to $7,000',
+        'date': '2019-07-22T09:21:00+01:00',
+        'image': 'https://images.cointelegraph.com/images/740_aHR0c...', # Truncated.
+        'description': 'Stability around $10,600 for Bitcoin price is ...' # Truncated.
+    }
+
 Changelog
 ---------
+0.2.5
+~~~~~
+* Added ``fetch_top_news()`` method for top crypto news, separated into 2 categories.
+
 0.2.4
 ~~~~~
 * ``fetch_news()`` adds a new "description" item and renames "image_url" to "image".
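Taken together, the README examples above map onto a short script. A minimal usage sketch, assuming karpet 0.2.5 is installed, ``Karpet`` is importable from the package root, and the coincodex.com/cointelegraph.com endpoints are reachable:

.. code-block:: python

    from karpet import Karpet

    k = Karpet()

    # Plain news feed (coincodex.com), 10 items by default.
    news = k.fetch_news("btc")

    # Top news scraped from the cointelegraph.com front page.
    editors_choice, hot_stories = k.fetch_top_news()

    # Every item is a plain dict with url/title/date/image/description.
    for article in editors_choice:
        print(article["date"], article["title"])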
diff --git a/karpet/core.py b/karpet/core.py
index 65d663a..ad44c8b 100644
--- a/karpet/core.py
+++ b/karpet/core.py
@@ -11,9 +11,8 @@
 import requests
 import aiohttp
 import re
-from datetime import timedelta, datetime
+from datetime import timedelta
 import time
-import sys
 import asyncio
@@ -311,8 +310,24 @@ class Karpet:
         * description
         * date
         * image
+
+        :param str symbol: Coin symbol the news will be fetched for.
+        :param int limit: Limit for the news count.
         """
+
+        def get_news(symbol, limit):
+            """
+            Fetches news from coincodex.com.
+
+            :return: List of news objects - {"url": "..."}.
+            :rtype: list
+            """
+
+            url = f"https://coincodex.com/api/coincodexicos/get_news/{symbol}/{limit}/1/"
+            data = self._get_json(url)
+
+            return [{"url": d["url"]} for d in data]
+
         def get_coin_slug(symbol):
             """
             Determines the coincodex.com URL slug for the given
@@ -339,78 +354,158 @@ class Karpet:
             if c["symbol"].upper() == symbol.upper():
                 return c["shortname"]
 
-        async def fetch_features(news):
+        # Fetch features.
+        news = get_news(symbol, limit)
+        asyncio.run(self._fetch_news_features(news))
+
+        return self._drop_bad_news(news)[:limit]
+
+    def fetch_top_news(self):
+        """
+        Fetches top crypto news. Returns editor's choice and hot stories.
+
+        * url
+        * title
+        * description
+        * date
+        * image
+
+        :return: Tuple where the first item is editor's choice news and the second is hot stories.
+        :rtype: tuple
+        """
+
+        def get_top_news():
             """
-            Asynchronously fetches all news features.
+            Fetches editor's choice and hot stories from the cointelegraph.com front page.
 
-            :param list news: List of news objects.
+            :return: Tuple of (editors_choice, hot_stories) news lists.
+            :rtype: tuple
             """
 
-            async def fetch_all(session, news):
+            headers = {
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0",
+            }
+            response = requests.get("https://cointelegraph.com/", headers=headers)
+            dom = BeautifulSoup(response.text, "lxml")
+
+            def parse_section_news(section):
                 """
-                Fetches all news features.
+                Parses a news section. The section consists of titles and
+                links to the news; only the links are parsed out.
 
-                :param aiohttp.ClientSession session: Session instance.
-                :param list news: List of news objects.
+                :param object section: BeautifulSoup element object of the section.
+                :return: List of news objects - {"url": "..."}.
+                :rtype: list
                 """
 
-                await asyncio.gather(*[fetch_one(session, n) for n in news])
+                news_items = section.find_all(class_="main-news-tabs__item")
+                news = []
 
-            async def fetch_one(session, news):
-                """
-                Fetches a few features to the given news object. Features
-                are set directly to the news object.
-                Fetched features are:
+                if len(news_items):
 
-                * image
-                * description
+                    for i in news_items:
+                        news.append({"url": i.find("a")["href"]})
 
-                :param aiohttp.ClientSession session: Session instance.
-                :param object news: News object.
-                """
+                return news
+
+            editors_choice, hot_stories = dom.find_all(class_="main-news-tabs__list")
+
+            return parse_section_news(editors_choice), parse_section_news(hot_stories)
+
+        # Fetch features.
+        editors_choice, hot_stories = get_top_news()
+        asyncio.run(self._fetch_news_features(editors_choice))
+        asyncio.run(self._fetch_news_features(hot_stories))
 
-            async with session.get(news["url"]) as response:
+        return editors_choice, hot_stories
 
-                html = await response.text()
-                dom = BeautifulSoup(html, features="lxml")
+    async def _fetch_news_features(self, news):
+        """
+        Asynchronously fetches all news features.
 
-                # Image.
-                try:
-                    news["image"] = dom.find("meta", {"property": "og:image"})["content"]
-                except:
-                    news["image"] = None
+        :param list news: List of news objects.
+        """
 
-                # Description.
-                try:
-                    news["description"] = dom.find("meta", {"property": "og:description"})["content"]
-                except:
-                    news["description"] = None
+        async def fetch_all(session, news):
+            """
+            Fetches all news features.
 
-            async with aiohttp.ClientSession() as session:
-                await fetch_all(session, news)
+            :param aiohttp.ClientSession session: Session instance.
+            :param list news: List of news objects.
+            """
 
-        url = f"https://coincodex.com/api/coincodexicos/get_news/{symbol}/{limit}/1/"
-        data = self.get_json(url)
+            await asyncio.gather(*[fetch_one(session, n) for n in news])
 
-        news = []
+        async def fetch_one(session, news):
+            """
+            Fetches a few features for the given news object. The features
+            are set directly on the news object.
+            Fetched features are:
 
-        for n in data:
-            try:
-                news.append({
-                    "url": n["url"],
-                    "title": n["title"],
-                    "date": datetime.strptime(n["date"], "%Y-%m-%d %H:%M:%S")
-                })
-            except:
-                tb = sys.exc_info()[2]
-                raise Exception("Couldn't parse news. Skipping...").with_traceback(tb)
+            * date
+            * image
+            * description
+
+            :param aiohttp.ClientSession session: Session instance.
+            :param object news: News object.
+            """
+
+            async with session.get(news["url"]) as response:
+
+                html = await response.text()
+                dom = BeautifulSoup(html, features="lxml")
+
+                # Title.
+                try:
+                    news["title"] = dom.find("meta", {"property": "og:title"})["content"]
+                except:
+                    news["title"] = None
+
+                # Date.
+                try:
+                    news["date"] = dom.find("meta", {"property": "article:published_time"})["content"]
+                except:
+                    news["date"] = None
+
+                # Image.
+                try:
+                    news["image"] = dom.find("meta", {"property": "og:image"})["content"]
+                except:
+                    news["image"] = None
+
+                # Description.
+                try:
+                    news["description"] = dom.find("meta", {"property": "og:description"})["content"]
+                except:
+                    news["description"] = None
+
+        async with aiohttp.ClientSession() as session:
+            await fetch_all(session, news)
+
+    def _drop_bad_news(self, news):
+        """
+        Drops news that doesn't meet the following requirement:
+
+        * must have a published date (date)
+
+        :param list news: List of news.
+        :return: Filtered list of news.
+        :rtype: list
+        """
+
+        filtered_news = []
+
+        for n in news:
+            if not n["date"]:
+                continue
 
-        # Fetch news features.
-        asyncio.run(fetch_features(news))
+            filtered_news.append(n)
 
-        return news
+        return filtered_news
 
-    def get_json(self, url):
+    def _get_json(self, url):
         """
         Downloads data from the given URL and parses it as JSON.
         Handles exceptions and raises its own with sane messages.
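The ``_fetch_news_features()`` coroutine above is an instance of a standard fan-out pattern: one shared ``aiohttp`` session, ``asyncio.gather()`` over per-article downloads, and Open Graph ``<meta>`` tags pulled from each page. A self-contained sketch of that pattern, assuming ``aiohttp``, ``beautifulsoup4`` and ``lxml`` are installed; the URL is a placeholder, not from the commit:

.. code-block:: python

    import asyncio

    import aiohttp
    from bs4 import BeautifulSoup

    def meta_content(dom, prop):
        """Returns the content of a <meta property=...> tag, or None if missing."""
        tag = dom.find("meta", {"property": prop})
        return tag["content"] if tag else None

    async def fetch_one(session, item):
        # Download the article page and enrich the dict in place.
        async with session.get(item["url"]) as response:
            dom = BeautifulSoup(await response.text(), "lxml")
            item["title"] = meta_content(dom, "og:title")
            item["date"] = meta_content(dom, "article:published_time")
            item["image"] = meta_content(dom, "og:image")
            item["description"] = meta_content(dom, "og:description")

    async def fetch_all(items):
        # One session for all requests; gather() runs the downloads concurrently.
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(*[fetch_one(session, i) for i in items])

    items = [{"url": "https://example.com/article"}]  # Placeholder URL.
    asyncio.run(fetch_all(items))
    print(items[0]["title"])

Checking the tag for ``None`` stands in for the bare ``except:`` clauses in the committed code, which would also swallow unrelated errors raised inside the parser.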
diff --git a/karpet/meta.py b/karpet/meta.py
index c698071..fdbef70 100644
--- a/karpet/meta.py
+++ b/karpet/meta.py
@@ -1,2 +1,2 @@
-__version__ = "0.2.4.1"
+__version__ = "0.2.5"
 __description__ = "Library for fetching coin/token metrics data from the internet."
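On the scraping side, ``fetch_top_news()`` hinges on two cointelegraph.com CSS classes: ``main-news-tabs__list`` for the two sections and ``main-news-tabs__item`` for the entries inside them. A sketch of the same parsing against inline sample HTML (the markup is illustrative, not the live page):

.. code-block:: python

    from bs4 import BeautifulSoup

    # Illustrative markup mimicking the structure the commit parses.
    html = """
    <ul class="main-news-tabs__list">
      <li class="main-news-tabs__item"><a href="https://example.com/a">A</a></li>
      <li class="main-news-tabs__item"><a href="https://example.com/b">B</a></li>
    </ul>
    <ul class="main-news-tabs__list">
      <li class="main-news-tabs__item"><a href="https://example.com/c">C</a></li>
    </ul>
    """

    dom = BeautifulSoup(html, "lxml")

    def parse_section_news(section):
        # Keep only the link of each item, like the committed helper does.
        return [
            {"url": item.find("a")["href"]}
            for item in section.find_all(class_="main-news-tabs__item")
        ]

    editors_choice, hot_stories = dom.find_all(class_="main-news-tabs__list")
    print(parse_section_news(editors_choice))  # [{'url': 'https://example.com/a'}, ...]
    print(parse_section_news(hot_stories))     # [{'url': 'https://example.com/c'}]

Note that the tuple unpacking of ``dom.find_all(...)`` assumes exactly two such lists exist on the page; a front-page redesign would raise ``ValueError`` there, which is the usual fragility of class-based scraping.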
diff --git a/test_karpet.py b/test_karpet.py
index c0f93f8..222337f 100644
--- a/test_karpet.py
+++ b/test_karpet.py
@@ -45,10 +45,10 @@ def test_fetch_tweets():
 
 def test_fetch_news():
 
-    c = Karpet()
-    news = c.fetch_news("eth")
+    k = Karpet()
+    news = k.fetch_news("eth")
 
-    assert len(news) == 10
+    assert len(news) > 0
     assert "url" in news[0]
     assert "title" in news[0]
     assert "date" in news[0]
@@ -56,6 +56,25 @@
 
 def test_fetch_news_with_limit():
 
-    c = Karpet()
+    k = Karpet()
+    news = k.fetch_news("eth", limit=30)
+
+    assert 0 < len(news) <= 30
+    print(f"Fetched {len(news)} news items.")
+
+
+def test_fetch_top_news():
+
+    k = Karpet()
+    editors_choice, hot_stories = k.fetch_top_news()
+
+    assert len(editors_choice) == 5
+    assert len(hot_stories) == 5
+
+    assert "url" in editors_choice[0]
+    assert "title" in editors_choice[0]
+    assert "date" in editors_choice[0]
 
-    assert len(c.fetch_news("eth", limit=30)) == 30
+    assert "url" in hot_stories[0]
+    assert "title" in hot_stories[0]
+    assert "date" in hot_stories[0]
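The tests above hit live endpoints, so they can fail on front-page layout changes or a missing network. The new ``_drop_bad_news()`` helper, by contrast, is pure and can be pinned down offline; a possible additional test, assuming ``Karpet`` is importable as in the existing test module:

.. code-block:: python

    from karpet import Karpet

    def test_drop_bad_news_requires_date():
        """_drop_bad_news() keeps only items with a published date - no network needed."""

        k = Karpet()
        news = [
            {"url": "https://example.com/a", "date": "2019-07-27T10:07:00+01:00"},
            {"url": "https://example.com/b", "date": None},  # Should be dropped.
        ]

        filtered = k._drop_bad_news(news)

        assert len(filtered) == 1
        assert filtered[0]["url"] == "https://example.com/a"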