From 0181b7d8551084920a97a1c0487502b5119a2eef Mon Sep 17 00:00:00 2001 From: Spectre Date: Fri, 5 Sep 2025 17:53:48 +0200 Subject: [PATCH] add sqlite3 db to avoid useless downloads --- main.py | 86 +++++++++- requirements.txt | 402 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 479 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index df71a78..d68527f 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ import fetcher as ft from urllib.request import urlretrieve import logging from logging.handlers import RotatingFileHandler +import sqlite3 # --- Configuration du logging --- logging.addLevelName(logging.DEBUG, "DÉBOGAGE") @@ -37,15 +38,83 @@ APP_KEY = os.environ["APP_KEY"] APP_SECRET = os.environ["APP_SECRET"] CONSUMER_KEY = os.environ["CONSUMER_KEY"] PATH_OVH = os.environ["OVH_PATH"] +DB_PATH = os.environ["DB_PATH"] YEAR = datetime.now().year # Année courante (int) +def get_conn(): + """ + Ouvre une connexion SQLite vers DB_PATH, crée la table 'bills' si nécessaire, puis retourne la connexion. + """ + try: + logger.debug("Ouverture de la connexion SQLite vers %s", DB_PATH) + conn = sqlite3.connect(DB_PATH) + logger.debug("Connexion établie, vérification/creation de la table 'bills'") + conn.execute(""" + CREATE TABLE IF NOT EXISTS bills ( + bill_id TEXT PRIMARY KEY, + bill_year INT + )""") + conn.commit() + logger.info("Base SQLite initialisée et table 'bills' disponible") + return conn + except Exception as e: + logger.exception("Erreur lors de l'initialisation de la base SQLite: %s", e) + raise + + +def add_entries_to_db(entries: list[tuple[str, int]], conn): + """ + Insère en lot des paires (bill_id, bill_year) dans la table 'bills' avec gestion de conflit sur bill_id. + """ + try: + logger.debug("Insertion batch dans 'bills': %d entrées", len(entries)) + conn.executemany( + """ + INSERT INTO bills (bill_id, bill_year) + VALUES (?, ?) + ON CONFLICT(bill_id) DO NOTHING + """, + entries, + ) + conn.commit() + logger.info("Insertion batch dans 'bills' validée") + except Exception as e: + logger.exception("Échec d'insertion batch dans 'bills': %s", e) + raise + + +def get_entries_from_db(conn) -> set[str]: + """ + Récupère l'ensemble des bill_id présents dans la table 'bills' et les retourne sous forme de set[str]. + """ + try: + logger.debug("Sélection des bill_id depuis 'bills'") + cursor = conn.execute("SELECT bill_id FROM bills") + rows = cursor.fetchall() + logger.info("Sélection terminée: %d bill_id récupérés", len(rows)) + return {row[0] for row in rows} + except Exception as e: + logger.exception("Échec de lecture des bill_id depuis 'bills': %s", e) + raise + + +def compare_db_to_data(db_data: set[str], data: list[str]) -> list[str]: + """ + Compare une collection d'identifiants 'data' à l'ensemble 'db_data' et retourne la liste des éléments absents de 'db_data'. + """ + missings_current_year = list() + for bill_id in data: + if bill_id not in db_data: + missings_current_year.append(bill_id) + return missings_current_year + + def indexer(ids: list[str]) -> list[str]: """ - Parcourt le répertoire de l'année courante et compare les factures déjà présentes - avec la liste d'IDs renvoyée par OVH. Ne conserve que les factures absentes - ET datées de l'année courante. + Parcourt le répertoire de l'année courante, filtre les factures déjà présentes localement, conserve les factures absentes datées de l'année courante, et enregistre en base celles qui appartiennent à une autre année. """ + conn = get_conn() logger.info("Indexation des factures pour l'année %s", YEAR) target_dir = f"{PATH_OVH}{YEAR}" try: @@ -54,10 +123,13 @@ def indexer(ids: list[str]) -> list[str]: logger.warning("Dossier %s inexistant, aucune facture locale", target_dir) ids_already_in = [] - missing = [x for x in ids if f"{x}.pdf" not in ids_already_in] + missing = compare_db_to_data( + get_entries_from_db(conn), [x for x in ids if f"{x}.pdf" not in ids_already_in] + ) logger.info("%d factures absentes détectées", len(missing)) result: list[str] = [] + not_valid_year: list[tuple[str, int]] = list() for bill_id in missing: try: meta = ft.fetch_invoice_content( @@ -67,12 +139,16 @@ def indexer(ids: list[str]) -> list[str]: consumer_key=CONSUMER_KEY, ) except Exception as e: - logger.error("Impossible de récupérer la méta pour %s : %s", bill_id, e) + logger.error("Impossible de récupérer le json pour %s : %s", bill_id, e) continue bill_year = datetime.fromisoformat(meta["date"]).year if bill_year == YEAR: result.append(bill_id) + else: + not_valid_year.append((bill_id, bill_year)) + add_entries_to_db(not_valid_year, conn) + logger.info(f"Ajouter {len(not_valid_year)} entrées a la base de donnée") logger.info("%d factures retenues pour téléchargement", len(result)) return result diff --git a/requirements.txt b/requirements.txt index 25c38fb..71a46d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,403 @@ -certifi==2025.8.3 -charset-normalizer==3.4.3 +about-time==4.2.1 +aiodns==3.3.0 +aiohappyeyeballs==2.4.4 +aiohttp==3.10.11 +aiosignal==1.4.0 +alive-progress==3.3.0 +altgraph==0.17.4 +amulet-core==1.9.29 +amulet-leveldb==1.0.2 +amulet-map-editor==0.10.42 +amulet-nbt==2.1.5 +annotated-types==0.7.0 +anvil-parser==0.9.0 +anyio==4.9.0 +appdirs==1.4.4 +argcomplete==3.6.2 +argon2-cffi==25.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.5 +asyncio-dgram==2.2.0 +attrs==25.1.0 +autocommand==2.2.2 +Automat==24.8.1 +autoslot==2022.12.1 +babel==2.17.0 +backoff==2.2.1 +bcc==0.33.0 +bcrypt==4.2.1 +Beaker==1.12.1 +beautifulsoup4==4.13.5 +beautifultable==1.1.0 +bidict==0.23.1 +black==25.1.0 +bleach==6.2.0 +blinker==1.9.0 +blivet==3.12.1 +blivet-gui==2.6.0 +boilerpy3==1.0.7 +Brlapi==0.8.6 +Brotli==1.1.0 +cattrs==25.1.1 +certifi==2022.12.7 +cffi==1.17.1 +chardet==5.2.0 +charset-normalizer==2.0.12 +cheroot==10.0.1 +CherryPy==18.10.0 +chess==1.11.2 +click==8.1.8 +cloudscraper==1.2.71 +colorama==0.4.6 +colorlog==6.9.0 +comm==0.2.2 +constantly==23.10.4 +construct==2.5.3 +contourpy==1.3.1 +crypt_r==3.13.1 +cryptography==44.0.0 +cson==0.8 +cssselect==1.2.0 +cupshelpers==1.0 +cycler==0.12.1 +dasbus==1.7 +dbus-fast==2.44.1 +dbus-python==1.3.2 +dbus_next==0.2.3 +debugpy==1.8.14 +decorator==5.2.1 +defusedxml==0.7.1 +deluge==2.2.0 +distro==1.9.0 +dnf==4.23.0 +dnspython==2.7.0 +docopt==0.6.2 +docstring_parser==0.17.0 +email_validator==2.2.0 +et_xmlfile==2.0.0 +evdev==1.9.1 +Events==0.5 +executing==2.2.0 +fastjsonschema==2.21.1 +fedora-third-party==0.10 +file-magic==0.4.0 +filelock==3.18.0 +filetype==1.2.0 +Flask==3.1.0 +Flask-Bcrypt==1.0.1 +Flask-Login==0.6.3 +Flask-SocketIO==5.5.1 +Flask-SQLAlchemy==3.1.1 +Flask-WTF==1.2.2 +fonttools==4.55.8 +fqdn==1.5.1 +fros==1.1 +frozendict==2.4.6 +frozenlist==1.5.0 +fsspec==2025.7.0 +future==1.0.0 +geographiclib==2.1 +GeoIP==1.3.2 +geopy==2.4.1 +ghunt==2.3.3 +git-filter-repo==2.47.0 +graphemeu==0.7.2 +greenlet==3.1.1 +gunicorn==23.0.0 +h11==0.16.0 +h2==4.3.0 +haystack-ai==2.16.1 +haystack-experimental==0.12.0 +hf-xet==1.1.5 +hpack==4.1.0 +httpcore==1.0.9 +httpx==0.27.2 +huggingface-hub==0.34.3 +humanize==4.12.0 +hyperframe==6.1.0 +hyperlink==21.0.0 +icmplib==3.0.4 idna==3.10 +ImageHash==4.3.2 +impacket==0.10.0 +importlib_metadata==8.6.1 +incremental==24.7.2 +inflect==7.5.0 +inflection==0.5.1 +inkex==1.4.0 +instaloader==4.14.2 +ipykernel==6.29.5 +ipython==9.3.0 +ipython_pygments_lexers==1.1.1 +iso639==0.1.4 +isoduration==20.11.0 +itsdangerous==2.2.0 +jaraco.classes==3.4.0 +jaraco.collections==5.2.1 +jaraco.context==6.0.1 +jaraco.functools==4.1.0 +jaraco.text==4.0.0 +jedi==0.19.2 +jeepney==0.8.0 +Jinja2==3.1.6 +jiter==0.10.0 +joblib==1.5.1 +json5==0.12.0 +jsonpickle==3.4.2 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter-events==0.12.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.16.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.3 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +keyring==25.6.0 +kiwisolver==1.4.8 +langtable==0.0.69 +lazy-imports==0.3.1 +ldap3==2.9.1 +ldapdomaindump==0.10.0 +libcomps==0.1.22 +libdnf==0.74.0 +libtorrent==2.0.11 +libvirt-python==11.0.0 +logging==0.4.9.6 +louis==3.33.0 +lutris==0.5.19 +lxml==5.3.2 +lz4==4.4.4 +Mako==1.2.3 +markdown-it-py==4.0.0 +MarkupSafe==3.0.2 +matplotlib==3.10.0 +matplotlib-inline==0.1.7 +maxminddb==2.8.2 +mcstatus==11.1.1 +mdurl==0.1.2 +meson==1.7.2 +minecraft-resource-pack==1.4.6 +minecraft_ping==0.0.4 +mistune==3.1.3 +moddb==0.12.0 +mopidy==4.0.0a4 +Mopidy-Iris==3.69.3 +more-itertools==10.5.0 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +mutagen==1.47.0 +mutf8==1.0.6 +mypy_extensions==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +NBT==1.5.1 +nest-asyncio==1.6.0 +netaddr==1.3.0 +networkx==3.4.2 +nftables==0.1 +notebook==7.4.3 +notebook_shim==0.2.4 +num2words==0.5.14 +numpy==1.26.4 oauthlib==3.3.1 +olefile==0.47 +openai==1.98.0 +openpyxl==3.1.5 +outcome==1.3.0.post0 +overrides==7.7.0 ovh==1.2.0 -python-dotenv==1.1.1 +packaging==24.2 +pandas==2.3.1 +pandocfilters==1.5.1 +parso==0.8.4 +Paste==3.10.1 +pathspec==0.12.1 +pefile==2024.8.26 +perf==0.1 +pexpect==4.9.0 +phonenumbers==9.0.13 +pid==2.2.3 +pillow==10.4.0 +pipx==1.7.1 +platformdirs==3.11.0 +ply==3.11 +portalocker==2.10.1 +portend==3.2.1 +posthog==6.3.1 +productmd==1.45 +prometheus_client==0.22.1 +prompt_toolkit==3.0.51 +prompthub-py==4.0.0 +protobuf==5.29.5 +proton-core==0.6.0 +proton-keyring-linux==0.2.0 +proton-vpn-api-core==0.45.6 +proton-vpn-daemon==0.12.0 +proton-vpn-gtk-app==4.10.0b0 +proton-vpn-lib==0.1.1 +proton-vpn-network-manager==0.12.15 +protonvpn_cli==2.2.12 +psutil==7.0.0 +ptyprocess==0.7.0 +publicsuffixlist==1.0.2.20250830 +pure_eval==0.2.3 +pwquality==1.4.5 +pyasn1==0.4.8 +pyasn1_modules==0.4.1 +PyAudio==0.2.13 +pycairo==1.25.1 +pycares==4.10.0 +pycparser==2.20 +pycrypto==2.6.1 +pycryptodomex==3.23.0 +pycups==2.0.4 +pydantic==1.10.22 +pydantic_core==2.33.2 +pyenchant==3.2.2 +pygame==2.6.1 +Pygments==2.19.0 +PyGObject==3.50.0 +pyinotify==0.9.6 +pyinstaller==6.11.1 +pyinstaller-hooks-contrib==2025.1 +pykickstart==3.62 +pykka==4.2.0 +PyMCTranslate==1.2.33 +pymunk==6.11.1 +PyMuPDF==1.26.4 +PyNaCl==1.5.0 +pynvim==0.5.2 +PyOpenGL==3.1.9 +pyOpenSSL==25.0.0 +pyparsing==3.2.1 +pyparted==3.13.0 +pypresence==4.3.0 +PyQt5==5.15.11 +PyQt5_sip==12.16.1 +PyRoxy @ git+https://github.com/MatrixTM/PyRoxy.git@ea0f88dbc0573292ba5672124f69e4e7a31b544d +pyserial==3.5 +PySocks==1.7.1 +python-augeas==1.2.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-engineio==4.11.2 +python-gnupg==0.5.4 +python-json-logger==3.3.0 +python-linux-procfs==0.7.3 +python-meh==0.52 +python-pam==2.0.2 +python-pptx==1.0.2 +python-ptrace==0.9.9 +python-socketio==5.12.1 +pythondialog==3.5.3 +pytz==2025.2 +pyudev==0.24.3 +PyWavelets==1.9.0 +pyxdg==0.27 +PyYAML==6.0.2 +pyynl @ file:///builddir/build/BUILD/kernel-6.16.3-build/kernel-6.16.3/linux-6.16.3-200.fc42.x86_64/tools/net/ynl +pyzmq==27.0.0 +quantulum3==0.9.2 +rank-bm25==0.2.2 +RapidFuzz==3.11.0 +referencing==0.36.2 +regex==2024.11.6 +rencode==1.0.6 +reportlab==4.4.3 requests==2.32.5 +requests-cache==0.9.8 +requests-file==2.0.0 +requests-ftp==0.3.1 +requests-futures==1.0.2 requests-oauthlib==2.0.0 -urllib3==2.5.0 +requests-toolbelt==1.0.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.4 +rich-argparse==1.7.1 +rpds-py==0.25.0 +rpm==4.20.1 +safetensors==0.5.3 +scikit-learn==1.7.1 +scipy==1.16.0 +scour==0.38.2 +SecretStorage==3.3.3 +secure==1.0.1 +selenium==4.34.0 +selinux @ file:///builddir/build/BUILD/libselinux-3.8-build/libselinux-3.8/src +Send2Trash==1.8.3 +sentry-sdk==2.21.0 +sepolicy @ file:///builddir/build/BUILD/policycoreutils-3.8-build/selinux-3.8/python/sepolicy +service-identity==24.2.0 +setools==4.5.1 +setuptools==80.9.0 +sherlock==0.4.1 +sherlock-project==0.15.0 +shtab==1.7.1 +simple-websocket==1.1.0 +simpleaudio==1.0.4 +simpleline==1.9.0 +six==1.17.0 +Slowloris==0.2.6 +sniffio==1.3.1 +sortedcontainers==2.4.0 +sos==4.10.0 +soupsieve==2.7 +speg==0.3 +SQLAlchemy==2.0.38 +sseclient-py==1.8.0 +stack-data==0.6.3 +stem==1.8.2 +sympy==1.13.3 +systemd-python==235 +tempora==5.8.1 +tenacity==9.1.2 +terminado==0.18.1 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +tinycss2==1.4.0 +tokenizers==0.21.4 +torch==2.7.1+cpu +torchaudio==2.7.1+cpu +torchvision==0.22.1+cpu +tornado==6.4.1 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.54.1 +trio==0.30.0 +trio-websocket==0.12.2 +Twisted==24.11.0 +typeguard==4.4.4 +types-python-dateutil==2.9.0.20250516 +typing-inspection==0.4.1 +typing_extensions==4.14.1 +tzdata==2025.2 +uri-template==1.3.0 +url-normalize==2.2.1 +urllib3==1.26.20 +userpath==1.9.2 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==14.2 +Werkzeug==3.1.3 +wheel==0.45.1 +wsproto==1.2.0 +WTForms==3.2.1 +wxPython==4.2.3 +xkbregistry==0.3 +XlsxWriter==3.2.3 +yarl==1.13.1 +yt-dlp==2025.8.27 +zc.lockfile==3.0.post1 +zipp==3.21.0 +zope.interface==7.2