From cda5f7c27870868e0ea01924e91cbdfd7d397147 Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Jan 2024 10:55:53 -0700 Subject: [PATCH 1/3] quick_mounts_purge --- qiita_db/test/test_util.py | 6 ++++ qiita_db/util.py | 71 +++++++++++++++++++++++++++++++++++++- scripts/qiita-cron-job | 8 ++++- 3 files changed, 83 insertions(+), 2 deletions(-) diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py index ef532ce2b..20b3d78d6 100644 --- a/qiita_db/test/test_util.py +++ b/qiita_db/test/test_util.py @@ -1291,6 +1291,12 @@ def test_purge_filepaths_test(self): fps_viewed = self._get_current_filepaths() self.assertCountEqual(fps_expected, fps_viewed) + def test_quick_mounts_purge(self): + obs = qdb.util.quick_mounts_purge() + exp = ('----------------------\nTotal files 0 0 Bytes\n------------' + '----------') + self.assertEqual(obs, exp) + STUDY_INFO = { 'study_id': 1, diff --git a/qiita_db/util.py b/qiita_db/util.py index 06ed3d417..641d4f441 100644 --- a/qiita_db/util.py +++ b/qiita_db/util.py @@ -49,13 +49,14 @@ from bcrypt import hashpw, gensalt from functools import partial from os.path import join, basename, isdir, exists, getsize -from os import walk, remove, listdir, rename +from os import walk, remove, listdir, rename, stat from glob import glob from shutil import move, rmtree, copy as shutil_copy from openpyxl import load_workbook from tempfile import mkstemp from csv import writer as csv_writer from datetime import datetime +from time import time as now from itertools import chain from contextlib import contextmanager import h5py @@ -896,6 +897,74 @@ def purge_filepaths(delete_files=True): qdb.sql_connection.TRN.execute() +def quick_mounts_purge(): + r"""This is a quick mount purge as it only slightly relies on the database + + Notes + ----- + Currenlty we delete anything older than 30 days that is not linked + to the database. This number is intentionally hardcoded in the code. 
+ At time of writting this number seem high but keeping it this way to be + safe. In the future, if needed, it can be changes. + """ + with qdb.sql_connection.TRN: + main_sql = """SELECT data_directory_id FROM qiita.artifact_type at + LEFT JOIN qiita.data_directory dd ON ( + dd.data_type = at.artifact_type) + WHERE subdirectory = true""" + qdb.sql_connection.TRN.add(main_sql) + mp_ids = qdb.sql_connection.TRN.execute_fetchflatten() + mounts = [qdb.util.get_mountpoint_path_by_id(x) for x in mp_ids] + folders = [join(x, f) for x in mounts for f in listdir(x) + if f.isnumeric()] + + # getting all unlinked folders + to_delete = [] + for i, f in enumerate(folders): + vals = f.split('/') + aid = int(vals[-1]) + artifact_type = vals[-2] + if artifact_type == 'FeatureData[Taxonomy]': + continue + + try: + a = qdb.artifact.Artifact(aid) + except qdb.exceptions.QiitaDBUnknownIDError: + to_delete.append(f) + continue + if not a.artifact_type.startswith(artifact_type): + raise ValueError('Review artifact type: ' + f'{a.id} {artifact_type} {a.artifact_type}') + + # now, let's just keep those older than 30 days (in seconds) + ignore = now() - (30*86400) + to_keep = [x for x in to_delete if stat(x).st_mtime >= ignore] + to_delete = set(to_delete) - set(to_keep) + + # get stats to report + stats = dict() + for td in to_delete: + f = td.split('/')[-2] + if f not in stats: + stats[f] = 0 + stats[f] += sum([getsize(join(p, fp)) for p, ds, fs in walk(td) + for fp in fs]) + + report = ['----------------------'] + for f, s in stats.items(): + report.append(f'{f}\t{naturalsize(s)}') + report.append( + f'Total files {len(to_delete)} {naturalsize(sum(stats.values()))}') + report.append('----------------------') + + for td in list(to_delete): + if not exists(td): + continue + rmtree(td) + + return '\n'.join(report) + + def _rm_exists(fp, obj, _id, delete_files): try: _id = int(_id) diff --git a/scripts/qiita-cron-job b/scripts/qiita-cron-job index 825791afc..80d75d479 100755 --- 
a/scripts/qiita-cron-job +++ b/scripts/qiita-cron-job @@ -12,7 +12,8 @@ import click from qiita_db.util import ( purge_filepaths as qiita_purge_filepaths, - empty_trash_upload_folder as qiita_empty_trash_upload_folder) + empty_trash_upload_folder as qiita_empty_trash_upload_folder, + quick_mounts_purge as qiita_quick_mounts_purge) from qiita_db.meta_util import ( update_redis_stats as qiita_update_redis_stats, generate_biom_and_metadata_release as @@ -62,5 +63,10 @@ def generate_plugin_releases(): qiita_generate_plugin_releases() +@commands.command() +def quick_mounts_purge(): + print(qiita_quick_mounts_purge()) + + if __name__ == "__main__": commands() From 0ae8c763a046149fb63505ea9d0561d88105f4ea Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Jan 2024 11:26:53 -0700 Subject: [PATCH 2/3] self.assertRaises --- qiita_db/test/test_util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py index 20b3d78d6..112cb3e6b 100644 --- a/qiita_db/test/test_util.py +++ b/qiita_db/test/test_util.py @@ -1292,10 +1292,10 @@ def test_purge_filepaths_test(self): self.assertCountEqual(fps_expected, fps_viewed) def test_quick_mounts_purge(self): - obs = qdb.util.quick_mounts_purge() - exp = ('----------------------\nTotal files 0 0 Bytes\n------------' - '----------') - self.assertEqual(obs, exp) + # one of the tests creates a conflicting artifact_type so this test + # will always raise this ValueError + with self.assertRaises(ValueError): + qdb.util.quick_mounts_purge() STUDY_INFO = { From e2cefe7e9401e599172ac09dfce35c89e1922c7b Mon Sep 17 00:00:00 2001 From: Antonio Gonzalez Date: Fri, 5 Jan 2024 12:31:41 -0700 Subject: [PATCH 3/3] address @charles-cowart comments --- qiita_db/util.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/qiita_db/util.py b/qiita_db/util.py index 641d4f441..df7153bb4 100644 --- a/qiita_db/util.py +++ b/qiita_db/util.py @@ -902,10 
+902,10 @@ def quick_mounts_purge(): Notes ----- - Currenlty we delete anything older than 30 days that is not linked + Currently we delete anything older than 30 days that is not linked to the database. This number is intentionally hardcoded in the code. - At time of writting this number seem high but keeping it this way to be - safe. In the future, if needed, it can be changes. + At the time of this writing this number seems high but keeping it + this way to be safe. In the future, if needed, it can be changed. """ with qdb.sql_connection.TRN: main_sql = """SELECT data_directory_id FROM qiita.artifact_type at @@ -958,9 +958,8 @@ def quick_mounts_purge(): report.append('----------------------') for td in list(to_delete): - if not exists(td): - continue - rmtree(td) + if exists(td): + rmtree(td) return '\n'.join(report)