Merge pull request #226 from Crinibus/add-database

Add database

Crinibus authored Nov 17, 2023
2 parents c0b21eb + 2c8a945 commit 964d546

Showing 28 changed files with 890 additions and 608 deletions.
45 changes: 42 additions & 3 deletions README.md
@@ -57,6 +57,30 @@ In version v2.3.0, I have added the column ```short_url``` to ```products.csv```.
</p>
</details>

<details><summary><h2>UPDATE TO HOW DATA IS STORED IN V3.0.0</h2></summary>
<p>

In version v3.0.0, I have changed where data is stored: from a JSON file to a SQLite database. If you have data from before v3.0.0, run the following commands in an interactive Python session to add the data from records.json to the database (**Note: Pandas is required**):
```
>>> from scraper.format_to_new import Format
>>> Format.from_json_to_db()
```

<br/>

**NOTE:** This will replace the content in the database with what is in records.json. That means any products and/or datapoints that are in the database but not in records.json will be deleted.


<br/>

Note: If you don't have Pandas installed, run this command:
```
pip3 install pandas
```
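
After the migration, a quick sanity check in the same interactive session could look like this (a minimal sketch, assuming the `scraper.db` helpers used in `main.py`, `create_db_and_tables` and `get_all_products`, can be called like this interactively):
```
>>> import scraper
>>> scraper.db.create_db_and_tables()
>>> active_products = scraper.db.get_all_products(select_only_active=True)
>>> len(active_products)  # roughly the number of products that were in records.json
```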

</p>
</details>

<br/>


@@ -147,6 +171,19 @@ python3 main.py -s --threads

<br/>

## Activating and deactivating products

When you add a new product, it is activated for scraping. If you no longer wish to scrape a product, you can deactivate it with the following command:
```
python3 main.py --deactivate --id <id>
```

You can activate a product again with the following command:
```
python3 main.py --activate --id <id>
```
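
Both flags hand `--id` straight to `update_products_is_active_with_product_codes`, which loops over a list of product codes, so it should be possible to pass several ids in one call (a hedged example with made-up ids; the exact way `--id` collects multiple values is defined in `scraper/arguments.py`, which this diff only shows in part):
```
python3 main.py --deactivate --id 1234567 7654321
```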

<br/>

## Delete data <a name="delete-data"></a>

@@ -171,14 +208,13 @@ Then just add products as described [here](#add-products).

<br/>

If you just want to reset your data for every product, deleting all datapoints inside every product, then run this command:
If you just want to delete all datapoints for every product, then run this command:
```
python3 main.py --reset --all
```
This deletes the data inside each product, such as id, url and all datapoints.


You can also just reset some products or all products in some categories:
You can also just delete datapoints for some products:
```
python3 main.py --reset --id <id>
```
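
At the database level, a reset boils down to removing the DataPoint rows tied to a product code. Below is a rough, hypothetical sketch with the sqlmodel models this commit introduces (the `db.engine` name and the session handling are assumptions, not the actual `reset_data` implementation):
```
from sqlmodel import Session, select

import scraper.database as db

# Hypothetical sketch: remove every datapoint stored for one product code
with Session(db.engine) as session:  # "db.engine" is an assumed name
    datapoints = session.exec(select(db.DataPoint).where(db.DataPoint.product_code == "1234567")).all()
    for datapoint in datapoints:
        session.delete(datapoint)
    session.commit()
```
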
@@ -274,8 +310,11 @@ This will print all the products in the following format:
CATEGORY
> PRODUCT NAME
- WEBSITE NAME - PRODUCT ID
- ✓ WEBSITE NAME - PRODUCT ID
```

The check mark (✓) shows that the product is activated.

<br/>


36 changes: 22 additions & 14 deletions main.py
@@ -1,4 +1,3 @@
from typing import List
import threading
import logging.config
import logging
@@ -13,7 +12,7 @@ def main() -> None:
args = scraper.argparse_setup()

if args.clean_data:
scraper.clean_records_data()
scraper.clean_datapoints()

if args.visualize:
scraper.visualize_data(args.all, args.category, args.id, args.name, args.up_to_date, args.compare)
@@ -24,6 +23,12 @@ def main() -> None:
if args.add:
scraper.add_products(args.category, args.url)

if args.activate:
scraper.update_products_is_active_with_product_codes(args.id, True)

if args.deactivate:
scraper.update_products_is_active_with_product_codes(args.id, False)

if args.search:
scraper.search(args.search)

@@ -34,7 +39,7 @@ def main() -> None:
scrape()

if args.latest_datapoint:
scraper.print_latest_datapoints(args.name, args.id)
scraper.print_latest_datapoints(args.name, args.id, args.category)

if args.print_all_products:
scraper.print_all_products()
@@ -47,18 +52,17 @@ def scrape() -> None:
print("Scraping...")

request_delay = scraper.Config.get_request_delay()
products_df = scraper.Filemanager.get_products_data()
active_products = scraper.db.get_all_products(select_only_active=True)

# Create instances of class "Scraper"
products = [scraper.Scraper(category, url) for category, url in zip(products_df["category"], products_df["url"])]
products = scraper.Format.db_products_to_scrapers(active_products)

with alive_progress.alive_bar(len(products), title="Scraping") as bar:
# Scrape and save scraped data for each product (sequentially)
for product in products:
bar.text = f"-> {product.url}"
time.sleep(request_delay)
product.scrape_info()
product.save_info()
scraper.add_product.add_new_datapoint_with_scraper(product)
bar()


@@ -67,18 +71,21 @@ def scrape_with_threads() -> None:

request_delay = scraper.Config.get_request_delay()

products_df = scraper.Filemanager.get_products_data()
domain_grouped_products_df = scraper.get_products_df_grouped_by_domains(products_df)
grouped_products = scraper.get_products_grouped_by_domain(domain_grouped_products_df)
grouped_db_products = scraper.db.get_all_products_grouped_by_domains(select_only_active=True)
grouped_products: list[list[scraper.Scraper]] = []

for db_products in grouped_db_products:
products = scraper.Format.db_products_to_scrapers(db_products)
grouped_products.append(products)

grouped_scraper_threads: List[List[threading.Thread]] = []
grouped_scraper_threads: list[list[threading.Thread]] = []

# Create scraper threads and group by domain
for products in grouped_products.values():
for products in grouped_products:
scraper_threads = [threading.Thread(target=product.scrape_info) for product in products]
grouped_scraper_threads.append(scraper_threads)

products_flatten = [product for products in grouped_products.values() for product in products]
products_flatten = [product for products in grouped_products for product in products]

with alive_progress.alive_bar(len(products_flatten), title="Scraping with threads") as progress_bar:
# Create master threads to manage scraper threads sequentially for each domain
@@ -97,10 +104,11 @@ def scrape_with_threads() -> None:

# Save scraped data for each product (sequentially)
for product in products_flatten:
product.save_info()
scraper.add_product.add_new_datapoint_with_scraper(product)


if __name__ == "__main__":
scraper.db.create_db_and_tables()
logging.config.fileConfig(
fname=scraper.Filemanager.logging_ini_path,
defaults={"logfilename": scraper.Filemanager.logfile_path},
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -1,2 +1,8 @@
[tool.black]
line-length = 127

[tool.ruff]
line-length = 127

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402"]
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,8 +1,8 @@
requests>=2.24.0
beautifulsoup4>=4.9.1
plotly>=4.12.0
pandas>=1.1.3
pytest>=7.1.2
pytest-mock>=3.8.2
alive-progress>=2.4.1
flake8>=6.0.0
sqlmodel>=0.0.8
7 changes: 4 additions & 3 deletions scraper/__init__.py
@@ -1,14 +1,15 @@
from .scrape import Scraper, start_threads_sequentially
from .arguments import argparse_setup
from .add_product import add_products
from .add_product import add_products, update_products_is_active_with_product_codes
from .filemanager import Filemanager, Config
from .visualize import visualize_data
from .clean_data import clean_records_data
from .clean_data import clean_datapoints
from .delete_data import delete
from .reset_data import reset
from .search_data import search
from .print_products import print_latest_datapoints, print_all_products
from .misc import get_products_df_grouped_by_domains, get_products_grouped_by_domain
from .format import Format
import scraper.database as db


__author__ = "Crinibus"
106 changes: 55 additions & 51 deletions scraper/add_product.py
@@ -1,13 +1,15 @@
from typing import List
import logging
from datetime import datetime

import scraper.database as db
from scraper.exceptions import WebsiteNotSupported, URLMissingSchema
from scraper.format import Format
from scraper.scrape import Scraper
from scraper.filemanager import Filemanager
from scraper.domains import get_website_name, SUPPORTED_DOMAINS
from scraper.constants import URL_SCHEMES


def add_products(categories: List[str], urls: List[str]) -> None:
def add_products(categories: list[str], urls: list[str]) -> None:
for category, url in zip(categories, urls):
try:
add_product(category, url)
@@ -31,77 +33,79 @@ def add_product(category: str, url: str) -> None:
logger.info(f"Adding product with category '{category}' and url '{url}'")

new_product = Scraper(category, url)
new_product.scrape_info()
new_product_info = new_product.scrape_info()

product_in_db = db.get_product_by_product_code(new_product_info.id)

if not check_if_product_exists(new_product):
save_product(new_product)
if product_in_db is None:
add_new_product_to_db(new_product)
add_new_datapoint_with_scraper(new_product)
return

logger.info("Product with the same product code already exists in database")

if product_in_db.is_active:
print("Product with the same product code already exists in database and is active")
return

user_input = input(
"A product with the same name and from the same website already exist in your data, "
"do you want to override this product? (y/n) > "
"A product with the same product id already exist in the database but is not active, "
"do you want to activate it? (y/n) > "
)

if user_input.lower() in ("y", "yes"):
print("Overriding product...")
save_product(new_product)
print("Activating product...")
set_existing_product_is_active(product_in_db, True)
logger.info("Product has been activated")
else:
print("Product was not added nor overrided")
logger.info("Adding product cancelled")


def check_if_product_exists(product: Scraper) -> bool:
data = Filemanager.get_record_data()
print("Product has not been activated")
logger.info("Product not activated")

category = product.category
product_name = product.product_info.name
website_name = product.website_handler.website_name

try:
data[category][product_name][website_name]
except KeyError:
return False
def add_new_product_to_db(product: Scraper) -> None:
product_to_db = Format.scraper_to_db_product(product, True)
db.add(product_to_db)

return True

def add_new_datapoint_to_db(product_code: str, price: float, currency: str, date: str | None = None):
"""Parameter 'date' defaults to the date of today in the format: YYYY-MM-DD"""
if date is None:
date = datetime.today().strftime("%Y-%m-%d")

def save_product(product: Scraper) -> None:
add_product_to_records(product)

if not check_if_product_exists_csv(product):
Filemanager.add_product_to_csv(product.category, product.url, product.website_handler.get_short_url())

product.save_info()

new_datapoint = db.DataPoint(
product_code=product_code,
date=date,
price=price,
currency=currency,
)

def add_product_to_records(product: Scraper) -> None:
data = Filemanager.get_record_data()
db.add(new_datapoint)

category = product.category
product_name = product.product_info.name
website_name = product.website_handler.website_name

empty_product_dict = {website_name: {"info": {}, "datapoints": []}}
def add_new_datapoint_with_scraper(product: Scraper, date: str | None = None) -> None:
if not product.product_info or not product.product_info.valid:
print(f"Product info is not valid - category: '{product.category}' - url: {product.url}")
return

if not data.get(category):
data.update({category: {product_name: empty_product_dict}})
product_code = product.product_info.id
price = product.product_info.price
currency = product.product_info.currency

if data[category].get(product_name):
data[category][product_name].update(empty_product_dict)
else:
data[category].update({product_name: empty_product_dict})
add_new_datapoint_to_db(product_code, price, currency, date)

Filemanager.save_record_data(data)

def update_products_is_active_with_product_codes(product_codes: list[str], is_active: bool) -> None:
action = "Activating" if is_active else "Deactivating"

def check_if_product_exists_csv(product: Scraper) -> bool:
products_df = Filemanager.get_products_data()
for product_code in product_codes:
print(f"{action} {product_code}")
product = db.get_product_by_product_code(product_code)
set_existing_product_is_active(product, is_active)

for category, url in zip(products_df["category"], products_df["url"]):
if product.category.lower() == category.lower() and product.url == url:
return True

return False
def set_existing_product_is_active(product: db.Product, is_active: bool) -> None:
product.is_active = is_active
db.add(product)


def is_missing_url_schema(url: str) -> bool:
14 changes: 12 additions & 2 deletions scraper/arguments.py
@@ -33,6 +33,10 @@ def argparse_setup() -> argparse.Namespace:

parser.add_argument("-u", "--url", help="the url to the product", type=str, nargs="*", action="extend")

parser.add_argument("--activate", help="activate a product to be scraped", action="store_true")

parser.add_argument("--deactivate", help="deactivate a product to not be scraped", action="store_true")

parser.add_argument(
"-v",
"--visualize",
@@ -140,6 +144,12 @@ def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
if args.add and args.visualize:
parser.error("Cannot use --add and --visualize at the same time")

if args.activate and args.deactivate:
parser.error("Cannot use --activate and --deactivate at the same time")

if (args.activate or args.deactivate) and not args.id:
parser.error("When using --activate or --deactivate, then --id is required")

if args.delete:
if args.all and any([args.category, args.name, args.id]):
parser.error("When using --delete and --all, then using --category, --name or --id does nothing")
@@ -163,7 +173,7 @@ def validate_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
)

if args.latest_datapoint:
if not args.name and not args.id:
parser.error("When using --latest-datapoint, then --name or --id is required")
if not any([args.name, args.id, args.category]):
parser.error("When using --latest-datapoint, then --name, --id or --category is required")

return args