From 9e276e73105d45d9c119cb4aac985bc6bdb94c56 Mon Sep 17 00:00:00 2001 From: Deimos Date: Tue, 26 Mar 2019 19:20:26 -0600 Subject: [PATCH] Clean extra data associated with deleted users This updates the clean_private_data script to delete more data associated with users that deleted their accounts at least 30 days ago, including all of their votes, subscriptions, bookmarks, and notifications. --- tildes/scripts/clean_private_data.py | 49 ++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/tildes/scripts/clean_private_data.py b/tildes/scripts/clean_private_data.py index abf0ae9..4d81d6d 100644 --- a/tildes/scripts/clean_private_data.py +++ b/tildes/scripts/clean_private_data.py @@ -16,10 +16,17 @@ from sqlalchemy.orm.session import Session from sqlalchemy.sql.expression import text from tildes.lib.database import get_session_from_config -from tildes.models.comment import Comment +from tildes.models.comment import ( + Comment, + CommentBookmark, + CommentLabel, + CommentNotification, + CommentVote, +) +from tildes.models.group import GroupSubscription from tildes.models.log import Log -from tildes.models.topic import Topic, TopicVisit -from tildes.models.user import User +from tildes.models.topic import Topic, TopicBookmark, TopicVisit, TopicVote +from tildes.models.user import User, UserGroupSettings # sensitive data older than this should be removed @@ -49,6 +56,9 @@ class DataCleaner: self.db_session = db_session self.retention_cutoff = datetime.now() - retention_period + # set high timeout for this script, since cleanup can activate a lot of triggers + self.db_session.execute("SET statement_timeout TO '10min'") + def clean_all(self) -> None: """Call all the cleanup functions.""" logging.info(f"Cleaning up all data (retention cutoff {self.retention_cutoff})") @@ -60,6 +70,8 @@ class DataCleaner: self.clean_old_deleted_topics() self.clean_old_deleted_users() + self.clean_old_deleted_user_data() + def delete_old_log_entries(self) -> None: """Delete all log entries older than the retention cutoff. @@ -183,3 +195,34 @@ class DataCleaner: ) self.db_session.commit() logging.info(f"Cleaned {updated} old deleted users.") + + def clean_old_deleted_user_data(self) -> None: + """Clean additional data from deleted users (subscriptions, votes, etc.).""" + models_to_delete_from = [ + CommentBookmark, + CommentLabel, + CommentNotification, + CommentVote, + GroupSubscription, + TopicBookmark, + TopicVote, + UserGroupSettings, + ] + + user_id_subquery = ( + self.db_session.query(User.user_id) + .filter( + User.is_deleted == True, # noqa + User.deleted_time <= self.retention_cutoff, # type: ignore + ) + .subquery() + ) + + for model_cls in models_to_delete_from: + deleted = ( + self.db_session.query(model_cls) + .filter(model_cls.user_id.in_(user_id_subquery)) # type: ignore + .delete(synchronize_session=False) + ) + self.db_session.commit() + logging.info(f"Deleted {deleted} rows from {model_cls.__name__}.")