From c18260ebcbe80993af6fd5e722507eae8428dd4b Mon Sep 17 00:00:00 2001
From: MMaker
Date: Mon, 18 Aug 2025 21:28:42 -0400
Subject: [PATCH] Add access tracker for cleaning CDN

---
 .gitignore        |  3 ++-
 access_tracker.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++
 app.py            |  8 ++++++
 clean.py          | 32 ++++++++++++++++++++---
 4 files changed, 104 insertions(+), 4 deletions(-)
 create mode 100644 access_tracker.py

diff --git a/.gitignore b/.gitignore
index 6e36383..8bd1691 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ venv
 .venv
 __pycache__
 cookies.txt
-.env
\ No newline at end of file
+.env
+access_times.json
\ No newline at end of file
diff --git a/access_tracker.py b/access_tracker.py
new file mode 100644
index 0000000..22661ab
--- /dev/null
+++ b/access_tracker.py
@@ -0,0 +1,65 @@
+import json
+import os
+import threading
+import time
+from typing import Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+class AccessTracker:
+    """Tracks when video URLs are accessed, storing data in JSON file and keeping it in memory"""
+
+    def __init__(self, json_file_path: str = "access_times.json"):
+        self.json_file_path = json_file_path
+        self.access_times: Dict[str, float] = {}
+        self.lock = threading.Lock()
+        self._load_from_file()
+
+    def _load_from_file(self) -> None:
+        """Load access times from JSON file into memory"""
+        try:
+            if os.path.exists(self.json_file_path):
+                with open(self.json_file_path, 'r') as f:
+                    self.access_times = json.load(f)
+                logger.info(f"Loaded {len(self.access_times)} access times from {self.json_file_path}")
+            else:
+                logger.info(f"Access times file {self.json_file_path} does not exist, starting fresh")
+        except Exception as e:
+            logger.error(f"Error loading access times from {self.json_file_path}: {e}")
+            self.access_times = {}
+
+    def _save_to_file(self) -> None:
+        """Save current access times from memory to JSON file"""
+        try:
+            with open(self.json_file_path, 'w') as f:
+                json.dump(self.access_times, f, indent=2)
+            logger.debug(f"Saved {len(self.access_times)} access times to {self.json_file_path}")
+        except Exception as e:
+            logger.error(f"Error saving access times to {self.json_file_path}: {e}")
+
+    def record_access(self, video_id: str) -> None:
+        """Record that a video was accessed at the current time"""
+        current_time = time.time()
+        with self.lock:
+            self.access_times[video_id] = current_time
+            self._save_to_file()
+            logger.debug(f"Recorded access for {video_id} at {current_time}")
+
+    def get_last_access(self, video_id: str) -> Optional[float]:
+        """Get the last access time for a video (returns None if never accessed)"""
+        with self.lock:
+            return self.access_times.get(video_id)
+
+    def get_all_access_times(self) -> Dict[str, float]:
+        """Get a copy of all access times"""
+        with self.lock:
+            return self.access_times.copy()
+
+    def remove_access_record(self, video_id: str) -> None:
+        """Remove access record for a video (e.g., when video is deleted)"""
+        with self.lock:
+            if video_id in self.access_times:
+                del self.access_times[video_id]
+                self._save_to_file()
+                logger.debug(f"Removed access record for {video_id}")
diff --git a/app.py b/app.py
index a9b1c1a..a5de1eb 100644
--- a/app.py
+++ b/app.py
@@ -17,6 +17,7 @@ from botocore.client import Config as BotoConfig
 import urllib.parse
 from dotenv import load_dotenv
+from access_tracker import AccessTracker
 
 load_dotenv()
 
 logging.basicConfig(
@@ -83,6 +84,8 @@ download_tracker = {
 download_lock = threading.Lock()
 download_queue = []
 
+access_tracker = AccessTracker()
+
 def download_and_upload_video(video_id, url, video_quality):
     try:
         with download_lock:
@@ -411,6 +414,11 @@ if you want to download videos, please consider using a tool like nndownload: ht
     logger.info(f"{video_id}: Caching HTML response")
     cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML)
 
+    # Record access time for CDN cleanup purposes
+    if is_video_in_cdn(video_id):
+        access_tracker.record_access(video_id)
+        logger.debug(f"{video_id}: Recorded access time for CDN tracking")
+
     logger.info(f"{video_id}: Returning response")
     logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------")
     return Response(html_response, mimetype="text/html")
diff --git a/clean.py b/clean.py
index e20faa0..cfbd7c4 100644
--- a/clean.py
+++ b/clean.py
@@ -6,6 +6,7 @@ import logging
 import boto3
 from botocore.client import Config as BotoConfig
 from dotenv import load_dotenv
+from access_tracker import AccessTracker
 
 logging.basicConfig(
     level=logging.INFO,
@@ -15,6 +16,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
+    access_tracker = AccessTracker()
     required_env_vars = [
         'NICONICOGAY_S3_ACCESS_KEY',
         'NICONICOGAY_S3_SECRET_KEY',
@@ -38,12 +40,13 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
     )
     bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
-    cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
+    cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)
 
     paginator = s3_client.get_paginator('list_objects_v2')
     page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
 
     total_files = 0
     objects_to_delete = []
+    access_times = access_tracker.get_all_access_times()
 
     for page in page_iterator:
         if 'Contents' not in page:
@@ -51,8 +54,31 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
 
         for obj in page['Contents']:
             total_files += 1
-            if obj['LastModified'] < cutoff_date: # type: ignore
-                objects_to_delete.append({'Key': obj['Key']}) # type: ignore
+            key = obj['Key'] # type: ignore
+
+            # Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
+            if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
+                video_id = key[len(f"{directory_prefix}/"):-4] # Remove prefix and .mp4 extension
+
+                last_access = access_times.get(video_id)
+                should_delete = False
+
+                if last_access is None:
+                    # No access record - delete files that haven't been accessed since tracking started
+                    # For safety, only delete files older than the cutoff date
+                    if obj['LastModified'].timestamp() < cutoff_timestamp: # type: ignore
+                        should_delete = True
+                        logger.debug(f"Will delete {video_id}: no access record and file is old")
+                elif last_access < cutoff_timestamp:
+                    # Has access record but last access was too long ago
+                    should_delete = True
+                    logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago")
+
+                if should_delete:
+                    objects_to_delete.append({'Key': key})
+                    # Remove the access record since we're deleting the file
+                    if not dry_run:
+                        access_tracker.remove_access_record(video_id)
 
     if len(objects_to_delete) == 0:
         logger.info("No files to delete")
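
A minimal usage sketch of the new AccessTracker API (illustrative only, not part of the patch to apply; the video id "sm12345" and the seven-day cutoff mirror the defaults above):

    import time
    from access_tracker import AccessTracker

    # Same default path the patch uses.
    tracker = AccessTracker("access_times.json")

    # app.py calls this whenever it serves a video that is already in the CDN.
    tracker.record_access("sm12345")

    # clean.py snapshots all access times, then compares each entry against a
    # cutoff expressed in epoch seconds (days=7 by default).
    cutoff = time.time() - 7 * 24 * 60 * 60
    last = tracker.get_last_access("sm12345")
    if last is not None and last < cutoff:
        # Stale entry: clean.py deletes the S3 object, then drops the record.
        tracker.remove_access_record("sm12345")

Note that every record_access() call rewrites the whole JSON file under the lock, which keeps the on-disk state current at the cost of one full write per tracked request.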
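
Because _save_to_file() opens access_times.json with open(path, 'w'), a crash mid-write can leave a truncated file, and _load_from_file() would then log the error and start fresh, discarding all timestamps. One possible hardening, sketched here as a suggestion rather than part of the patch: write to a temporary file in the same directory and rename it into place, which os.replace does atomically on POSIX.

    import json
    import os
    import tempfile

    def save_atomically(data: dict, path: str) -> None:
        # Create the temp file next to the target so the rename never
        # crosses filesystems, then atomically swap it into place.
        fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(os.path.abspath(path)))
        try:
            with os.fdopen(fd, "w") as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, path)
        except BaseException:
            os.unlink(tmp_path)
            raise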