Merge pull request #226 from Crinibus/add-database

Add database

Crinibus authored Nov 17, 2023
2 parents c0b21eb + 2c8a945 commit 964d546

Showing 28 changed files with 890 additions and 608 deletions.
45 changes: 42 additions & 3 deletions README.md
@@ -57,6 +57,30 @@ In version v2.3.0, I have added the column ```short_url``` to ```products.csv```.
</p>
</details>

<details><summary><h2>UPDATE TO HOW DATA IS STORED IN V3.0.0</h2></summary>
<p>

In version v3.0.0, I have changed where data is stored: from a JSON file to a SQLite database. If you have data from before v3.0.0, run the following commands in an interactive Python session to add the data from records.json to the database (**Note: Pandas is required**):
```
>>> from scraper.format_to_new import Format
>>> Format.from_json_to_db()
```

<br/>

**NOTE:** This will replace the content in the database with what is in records.json. That means any products and/or datapoints that are in the database but not in records.json will be deleted.


<br/>

Note: If you don't have Pandas installed, run this command:
```
pip3 install pandas
```
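
After the migration, a quick sanity check in the same interactive session could look like this (a minimal sketch, assuming the `scraper.db` helpers used in `main.py`, `create_db_and_tables` and `get_all_products`, can be called like this interactively):
```
>>> import scraper
>>> scraper.db.create_db_and_tables()
>>> active_products = scraper.db.get_all_products(select_only_active=True)
>>> len(active_products)  # roughly the number of products that were in records.json
```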

</p>
</details>

<br/>


@@ -147,6 +171,19 @@ python3 main.py -s --threads

<br/>

## Activating and deactivating products

When you add a new product, it is activated for scraping. If you no longer wish to scrape a product, you can deactivate it with the following command:
```
python3 main.py --deactivate --id <id>
```

You can activate a product again with the following command:
```
python3 main.py --activate --id <id>
```
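
Both flags hand `--id` straight to `update_products_is_active_with_product_codes`, which loops over a list of product codes, so it should be possible to pass several ids in one call (a hedged example with made-up ids; the exact way `--id` collects multiple values is defined in `scraper/arguments.py`, which this diff only shows in part):
```
python3 main.py --deactivate --id 1234567 7654321
```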

<br/>

## Delete data <a name="delete-data"></a>

@@ -171,14 +208,13 @@ Then just add products as described [here](#add-products).

<br/>

If you just want to reset your data for every product, deleting all datapoints inside every product, then run this command:
If you just want to delete all datapoints for every product, then run this command:
```
python3 main.py --reset --all
```
This deletes the data inside each product, such as id, url and all datapoints.


You can also just reset some products or all products in some categories:
You can also just delete datapoints for some products:
```
python3 main.py --reset --id <id>
```
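
At the database level, a reset boils down to removing the DataPoint rows tied to a product code. Below is a rough, hypothetical sketch with the sqlmodel models this commit introduces (the `db.engine` name and the session handling are assumptions, not the actual `reset_data` implementation):
```
from sqlmodel import Session, select

import scraper.database as db

# Hypothetical sketch: remove every datapoint stored for one product code
with Session(db.engine) as session:  # "db.engine" is an assumed name
    datapoints = session.exec(select(db.DataPoint).where(db.DataPoint.product_code == "1234567")).all()
    for datapoint in datapoints:
        session.delete(datapoint)
    session.commit()
```
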
@@ -274,8 +310,11 @@ This will print all the products in the following format:
CATEGORY
> PRODUCT NAME
- WEBSITE NAME - PRODUCT ID
- ✓ WEBSITE NAME - PRODUCT ID
```

The check mark (✓) shows that the product is activated.

<br/>


36 changes: 22 additions & 14 deletions main.py
@@ -1,4 +1,3 @@
from typing import List
import threading
import logging.config
import logging
@@ -13,7 +12,7 @@ def main() -> None:
args = scraper.argparse_setup()

if args.clean_data:
scraper.clean_records_data()
scraper.clean_datapoints()

if args.visualize:
scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)
@@ -24,6 +23,12 @@ def main() -> None:
if args.add:
scraper.add_products(args.category, args.url)

if args.activate:
scraper.update_products_is_active_with_product_codes(args.id, True)

if args.deactivate:
scraper.update_products_is_active_with_product_codes(args.id, False)

if args.search:
scraper.search(args.search)

@@ -34,7 +39,7 @@ def main() -> None:
scrape()

if args.latest_datapoint:
scraper.print_latest_datapoints(args.name, args.id)
scraper.print_latest_datapoints(args.name, args.id, args.category)

if args.print_all_products:
scraper.print_all_products()
@@ -47,18 +52,17 @@ def scrape() -> None:
print("Scraping...")

request_delay = scraper.Config.get_request_delay()
products_df = scraper.Filemanager.get_products_data()
active_products = scraper.db.get_all_products(select_only_active=True)

# Create instances of class "Scraper"
products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
products = scraper.Format.db_products_to_scrapers(active_products)

with alive_progress.alive_bar(len(products), title="Scraping") as bar:
# Scrape and save scraped data for each product (sequentially)
for product in products:
bar.text = f"-> {product.url}"
time.sleep(request_delay)
product.scrape_info()
product.save_info()
scraper.add_product.add_new_datapoint_with_scraper(product)
bar()


@@ -67,18 +71,21 @@ def scrape_with_threads() -> None:

request_delay = scraper.Config.get_request_delay()

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)
grouped_db_products = scraper.db.get_all_products_grouped_by_domains(select_only_active=True)
grouped_products: list[list[scraper.Scraper]] = []

for db_products in grouped_db_products:
products = scraper.Format.db_products_to_scrapers(db_products)
grouped_products.append(products)

grouped_scraper_threads: List[List[threading.Thread]] = []
grouped_scraper_threads: list[list[threading.Thread]] = []

# Create scraper threads and group by domain
for products in grouped_products.values():
for products in grouped_products:
scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
grouped_scraper_threads.append(scraper_threads)

products_flatten = [product for products in grouped_products.values() for product in products]
products_flatten = [product for products in grouped_products for product in products]

with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
# Create master threads to manage scraper threads sequentially for each domain
@@ -97,10 +104,11 @@ def scrape_with_threads() -> None:

# Save scraped data for each product (sequentially)
for product in products_flatten:
product.save_info()
scraper.add_product.add_new_datapoint_with_scraper(product)


if __name__ == "__main__":
scraper.db.create_db_and_tables()
logging.config.fileConfig(
fname=scraper.Filemanager.logging_ini_path,
defaults={"logfilename": scraper.Filemanager.logfile_path},
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -1,2 +1,8 @@
[tool.black]
line-length = 127

[tool.ruff]
line-length = 127

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402"]
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,8 +1,8 @@
requests>=2.24.0
beautifulsoup4>=4.9.1
plotly>=4.12.0
pandas>=1.1.3
pytest>=7.1.2
pytest-mock>=3.8.2
alive-progress>=2.4.1
flake8>=6.0.0
sqlmodel>=0.0.8
7 changes: 4 additions & 3 deletions scraper/__init__.py
@@ -1,14 +1,15 @@
from .scrape import Scraper, start_threads_sequentially
from .arguments import argparse_setup
from .add_product import add_products
from .add_product import add_products, update_products_is_active_with_product_codes
from .filemanager import Filemanager, Config
from .visualize import visualize_data
from .clean_data import clean_records_data
from .clean_data import clean_datapoints
from .delete_data import delete
from .reset_data import reset
from .search_data import search
from .print_products import print_latest_datapoints, print_all_products
from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain
from .format import Format
import scraper.database as db


__author__ = "Crinibus"
106 changes: 55 additions & 51 deletions scraper/add_product.py
@@ -1,13 +1,15 @@
from typing import List
import logging
from datetime import datetime

import scraper.database as db
from scraper.exceptions import WebsiteNotSupported, URLMissingSchema
from scraper.format import Format
from scraper.scrape import Scraper
from scraper.filemanager import Filemanager
from scraper.domains import get_website_name, SUPPORTED_DOMAINS
from scraper.constants import URL_SCHEMES


def add_products(categories: List[str], urls: List[str]) -> None:
def add_products(categories: list[str], urls: list[str]) -> None:
for category, url in zip(categories, urls):
try:
add_product(category, url)
@@ -31,77 +33,79 @@ def add_product(category: str, url: str) -> None:
logger.info(f"Adding product with category '{category}' and url '{url}'")

new_product = Scraper(category, url)
new_product.scrape_info()
new_product_info = new_product.scrape_info()

product_in_db = db.get_product_by_product_code(new_product_info.id)

if not check_if_product_exists(new_product):
save_product(new_product)
if product_in_db is None:
add_new_product_to_db(new_product)
add_new_datapoint_with_scraper(new_product)
return

logger.info("Product with the same product code already exists in database")

if product_in_db.is_active:
print("Product with the same product code already exists in database and is active")
return

user_input = input(
"A product with the same name and from the same website already exist in your data, "
"do you want to override this product? (y/n) > "
"A product with the same product id already exist in the database but is not active, "
"do you want to activate it? (y/n) > "
)

if user_input.lower() in ("y", "yes"):
print("Overriding product...")
save_product(new_product)
print("Activating product...")
set_existing_product_is_active(product_in_db, True)
logger.info("Product has been activated")
else:
print("Product was not added nor overrided")
logger.info("Adding product cancelled")


def check_if_product_exists(product: Scraper) -> bool:
data = Filemanager.get_record_data()
print("Product has not been activated")
logger.info("Product not activated")

category = product.category
product_name = product.product_info.name
website_name = product.website_handler.website_name

try:
data[category][product_name][website_name]
except KeyError:
return False
def add_new_product_to_db(product: Scraper) -> None:
product_to_db = Format.scraper_to_db_product(product, True)
db.add(product_to_db)

return True

def add_new_datapoint_to_db(product_code: str, price: float, currency: str, date: str | None = None):
"""Parameter 'date' defaults to the date of today in the format: YYYY-MM-DD"""
if date is None:
date = datetime.today().strftime("%Y-%m-%d")

def save_product(product: Scraper) -> None:
add_product_to_records(product)

if not check_if_product_exists_csv(product):
Filemanager.add_product_to_csv(product.category, product.url, product.website_handler.get_short_url())

product.save_info()

new_datapoint = db.DataPoint(
product_code=product_code,
date=date,
price=price,
currency=currency,
)

def add_product_to_records(product: Scraper) -> None:
data = Filemanager.get_record_data()
db.add(new_datapoint)

category = product.category
product_name = product.product_info.name
website_name = product.website_handler.website_name

empty_product_dict = {website_name: {"info": {}, "datapoints": []}}
def add_new_datapoint_with_scraper(product: Scraper, date: str | None = None) -> None:
if not product.product_info or not product.product_info.valid:
print(f"Product info is not valid - category: '{product.category}' - url: {product.url}")
return

if not data.get(category):
data.update({category: {product_name: empty_product_dict}})
product_code = product.product_info.id
price = product.product_info.price
currency = product.product_info.currency

if data[category].get(product_name):
data[category][product_name].update(empty_product_dict)
else:
data[category].update({product_name: empty_product_dict})
add_new_datapoint_to_db(product_code, price, currency, date)

Filemanager.save_record_data(data)

def update_products_is_active_with_product_codes(product_codes: list[str], is_active: bool) -> None:
action = "Activating" if is_active else "Deactivating"

def check_if_product_exists_csv(product: Scraper) -> bool:
products_df = Filemanager.get_products_data()
for product_code in product_codes:
print(f"{action} {product_code}")
product = db.get_product_by_product_code(product_code)
set_existing_product_is_active(product, is_active)

for category, url in zip(products_df["category"], products_df["url"]):
if product.category.lower() == category.lower() and product.url == url:
return True

return False
def set_existing_product_is_active(product: db.Product, is_active: bool) -> None:
product.is_active = is_active
db.add(product)


def is_missing_url_schema(url: str) -> bool:
14 changes: 12 additions & 2 deletions scraper/arguments.py
@@ -33,6 +33,10 @@ def argparse_setup() -> argparse.Namespace:

parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")

parser.add_argument("--activate", help="activate a product to be scraped", action="store_true")

parser.add_argument("--deactivate", help="deactivate a product to not be scraped", action="store_true")

parser.add_argument(
"-v",
"--visualize",
@@ -140,6 +144,12 @@ def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
if args.add and args.visualize:
parser.error("Cannot use --add and --visualize at the same time")

if args.activate and args.deactivate:
parser.error("Cannot use --activate and --deactivate at the same time")

if (args.activate or args.deactivate) and not args.id:
parser.error("When using --activate or --deactivate, then --id is required")

if args.delete:
if args.all and any([args.category, args.name, args.id]):
parser.error("When using --delete and --all, then using --category, --name or --id does nothing")
@@ -163,7 +173,7 @@ def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
)

if args.latest_datapoint:
if not args.name and not args.id:
parser.error("When using --latest-datapoint, then --name or --id is required")
if not any([args.name, args.id, args.category]):
parser.error("When using --latest-datapoint, then --name, --id or --category is required")

return args