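"""Clean up old video files from the DigitalOcean Spaces bucket (S3-compatible API).

Objects under the configured prefix that have not been accessed within the
retention window (7 days by default) are deleted, based on the last-access
records kept by AccessTracker. Pass --dry-run to preview what would be
deleted without removing anything.

Required environment variables (typically loaded from a .env file):
    NICONICOGAY_S3_ACCESS_KEY, NICONICOGAY_S3_SECRET_KEY,
    NICONICOGAY_S3_BUCKET_NAME, NICONICOGAY_S3_REGION
"""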
import os
import sys
import datetime
import argparse
import logging

import boto3
from botocore.client import Config as BotoConfig
from dotenv import load_dotenv

from access_tracker import AccessTracker

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
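    """Delete stale .mp4 files under `directory_prefix/` from the S3 bucket.

    A file is considered stale when its recorded last access (or, if it has
    no access record, its LastModified time) is more than `days` days old.
    When `dry_run` is True, nothing is deleted and access records are kept.
    """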
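    # AccessTracker comes from the local access_tracker module. This script
    # only assumes it provides get_all_access_times(), mapping video_id to a
    # last-access Unix timestamp, and remove_access_record(video_id).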
    access_tracker = AccessTracker()

    required_env_vars = [
        'NICONICOGAY_S3_ACCESS_KEY',
        'NICONICOGAY_S3_SECRET_KEY',
        'NICONICOGAY_S3_BUCKET_NAME',
        'NICONICOGAY_S3_REGION'
    ]
    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
    if missing_vars:
        logger.error(f"Missing required environment variables: {', '.join(missing_vars)}")
        sys.exit(1)

    try:
        s3_session = boto3.Session()
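        # The bucket lives on DigitalOcean Spaces, which speaks the S3 API:
        # point boto3 at the regional Spaces endpoint and use
        # virtual-hosted-style addressing (bucket.<region>.digitaloceanspaces.com).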
        s3_client = s3_session.client(
            's3',
            aws_access_key_id=os.environ['NICONICOGAY_S3_ACCESS_KEY'],
            aws_secret_access_key=os.environ['NICONICOGAY_S3_SECRET_KEY'],
            region_name=os.environ['NICONICOGAY_S3_REGION'],
            endpoint_url=f"https://{os.environ['NICONICOGAY_S3_REGION']}.digitaloceanspaces.com",
            config=BotoConfig(s3={'addressing_style': 'virtual'}),
        )

        bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
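
        # Files not accessed within the last `days` days (days * 86400 seconds)
        # are candidates for deletion.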
        cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)
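
        # list_objects_v2 returns at most 1000 keys per page; the paginator
        # follows continuation tokens so every object under the prefix is listed.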
        paginator = s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")

        total_files = 0
        objects_to_delete = []
        access_times = access_tracker.get_all_access_times()

        for page in page_iterator:
            if 'Contents' not in page:
                continue

            for obj in page['Contents']:
                total_files += 1
                key = obj['Key']  # type: ignore

                # Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
                if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
                    video_id = key[len(f"{directory_prefix}/"):-4]  # Remove prefix and .mp4 extension

                    last_access = access_times.get(video_id)
                    should_delete = False

                    if last_access is None:
                        # No access record - the file hasn't been accessed since tracking started
                        # For safety, only delete such files if they are older than the cutoff date
                        if obj['LastModified'].timestamp() < cutoff_timestamp:  # type: ignore
                            should_delete = True
                            logger.debug(f"Will delete {video_id}: no access record and file is old")
                    elif last_access < cutoff_timestamp:
                        # Has an access record, but the last access was too long ago
                        should_delete = True
                        logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago")

                    if should_delete:
                        objects_to_delete.append({'Key': key})
                        # Remove the access record since we're deleting the file
                        if not dry_run:
                            access_tracker.remove_access_record(video_id)
2025-02-26 13:36:36 -05:00
|
|
|
|
|
|
|
|
if len(objects_to_delete) == 0:
|
|
|
|
|
logger.info("No files to delete")
|
|
|
|
|
return
|

        if dry_run:
            logger.info(f"DRY RUN: Would delete {len(objects_to_delete)} out of {total_files} files")
        else:
            # Delete in batches of 1000 keys, the maximum the S3 DeleteObjects API accepts per request
            for i in range(0, len(objects_to_delete), 1000):
                batch = objects_to_delete[i:i+1000]
                s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={'Objects': batch}
                )
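            # Note: delete_objects reports any per-key failures in the
            # "Errors" field of its response, which is not inspected here.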
            logger.info(f"Successfully deleted {len(objects_to_delete)} out of {total_files} files")

    except Exception as e:
        logger.error(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    load_dotenv()

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting anything"
    )
    args = parser.parse_args()

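    # days and directory_prefix are left at their defaults (7 days, "niconico").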
    cleanup_old_files(dry_run=args.dry_run)