openedx · nedbat · Jul 19, 2013 · Jul 12, 2013 · Jul 15, 2013 · Jul 18, 2013
diff --git a/lms/djangoapps/courseware/management/commands/clean_history.py b/lms/djangoapps/courseware/management/commands/clean_history.py
@@ -0,0 +1,239 @@
+"""A command to clean the StudentModuleHistory table.
+
+When we added XBlock storage, each field modification wrote a new history row
+to the db.  Now that we have bulk saves to avoid that database hammering, we
+need to clean out the unnecessary rows from the database.
+
+This command does that.
+
+"""
+
+import datetime
+import json
+import logging
+import optparse
+import time
+import traceback
+
+from django.core.management.base import NoArgsCommand
+from django.db import connection
+
+
+class Command(NoArgsCommand):
+    """The actual clean_history command to clean history rows."""
+
+    help = "Deletes unneeded rows from the StudentModuleHistory table."
+
+    option_list = NoArgsCommand.option_list + (
+        optparse.make_option(
+            '--batch',
+            type='int',
+            default=100,
+            help="Batch size, number of module_ids to examine in a transaction.",
+        ),
+        optparse.make_option(
+            '--dry-run',
+            action='store_true',
+            default=False,
+            help="Don't change the database, just show what would be done.",
+        ),
+        optparse.make_option(
+            '--sleep',
+            type='float',
+            default=0,
+            help="Seconds to sleep between batches.",
+        ),
+    )
+
+    def handle_noargs(self, **options):
+        # We don't want to see the SQL output from the db layer.
+        logging.getLogger("django.db.backends").setLevel(logging.INFO)
+
+        smhc = StudentModuleHistoryCleaner(
+            dry_run=options["dry_run"],
+        )
+        smhc.main(batch_size=options["batch"], sleep=options["sleep"])
+
+
+class StudentModuleHistoryCleaner(object):
+    """Logic to clean rows from the StudentModuleHistory table."""
+
+    DELETE_GAP_SECS = 0.5   # Rows this close can be discarded.
+    STATE_FILE = "clean_history.json"
+    BATCH_SIZE = 100
+
+    def __init__(self, dry_run=False):
+        self.dry_run = dry_run
+        self.next_student_module_id = 0
+        self.last_student_module_id = 0
+
+    def main(self, batch_size=None, sleep=0):
+        """Invoked from the management command to do all the work."""
+
+        batch_size = batch_size or self.BATCH_SIZE
+
+        connection.enter_transaction_management()
+
+        self.last_student_module_id = self.get_last_student_module_id()
+        self.load_state()
+
+        while self.next_student_module_id <= self.last_student_module_id:
+            for smid in self.module_ids_to_check(batch_size):
+                try:
+                    self.clean_one_student_module(smid)
+                except Exception:       # pylint: disable=W0703
+                    trace = traceback.format_exc()
+                    self.say("Couldn't clean student_module_id {}:\n{}".format(smid, trace))
+            if not self.dry_run:
+                self.commit()
+            self.save_state()
+            if sleep:
+                time.sleep(sleep)
+
+    def say(self, message):
+        """
+        Display a message to the user.
+
+        The message will have a trailing newline added to it.
+
+        """
+        print message
+
+    def commit(self):
+        """
+        Commit the transaction.
+        """
+        self.say("Committing")
+        connection.commit()
+
+    def load_state(self):
+        """
+        Load the latest state from disk.
+        """
+        try:
+            state_file = open(self.STATE_FILE)
+        except IOError:
+            self.say("No stored state")
+            self.next_student_module_id = 0
+        else:
+            with state_file:
+                state = json.load(state_file)
+            self.say(
+                "Loaded stored state: {}".format(
+                    json.dumps(state, sort_keys=True)
+                )
+            )
+            self.next_student_module_id = state['next_student_module_id']
+
+    def save_state(self):
+        """
+        Save the state to disk.
+        """
+        state = {
+            'next_student_module_id': self.next_student_module_id,
+        }
+        with open(self.STATE_FILE, "w") as state_file:
+            json.dump(state, state_file)
+        self.say("Saved state: {}".format(json.dumps(state, sort_keys=True)))
+
+    def get_last_student_module_id(self):
+        """
+        Return the id of the last student_module.
+        """
+        cursor = connection.cursor()
+        cursor.execute("""
+            SELECT max(student_module_id) FROM courseware_studentmodulehistory
+            """)
+        last = cursor.fetchone()[0]
+        self.say("Last student_module_id is {}".format(last))
+        return last
+
+    def module_ids_to_check(self, batch_size):
+        """Produce a sequence of student module ids to check.
+
+        `batch_size` is how many module ids to produce, max.
+
+        The sequence starts with `next_student_module_id`, and goes up to
+        and including `last_student_module_id`.
+
+        `next_student_module_id` is updated as each id is yielded.
+
+        """
+        start = self.next_student_module_id
+        for smid in range(start, start+batch_size):
+            if smid > self.last_student_module_id:
+                break
+            yield smid
+            self.next_student_module_id = smid+1
+
+    def get_history_for_student_modules(self, student_module_id):
+        """
+        Get the history rows for a student module.
+
+        ```student_module_id```: the id of the student module we're
+        interested in.
+
+        Return a list: [(id, created), ...], all the rows of history.
+
+        """
+        cursor = connection.cursor()
+        cursor.execute("""
+            SELECT id, created FROM courseware_studentmodulehistory
+            WHERE student_module_id = %s
+            ORDER BY created, id
+            """,
+            [student_module_id]
+        )
+        history = cursor.fetchall()
+        return history
+
+    def delete_history(self, ids_to_delete):
+        """
+        Delete history rows.
+
+        ```ids_to_delete```: a non-empty list (or set...) of history row ids to delete.
+
+        """
+        assert ids_to_delete
+        cursor = connection.cursor()
+        cursor.execute("""
+            DELETE FROM courseware_studentmodulehistory
+            WHERE id IN ({ids})
+            """.format(ids=",".join(str(i) for i in ids_to_delete))
+        )
+
+    def clean_one_student_module(self, student_module_id):
+        """Clean one StudentModule's-worth of history.
+
+        `student_module_id`: the id of the StudentModule to process.
+
+        """
+        delete_gap = datetime.timedelta(seconds=self.DELETE_GAP_SECS)
+
+        history = self.get_history_for_student_modules(student_module_id)
+        if not history:
+            self.say("No history for student_module_id {}".format(student_module_id))
+            return
+
+        ids_to_delete = []
+        next_created = None
+        for history_id, created in reversed(history):
+            if next_created is not None:
+                # Compare this timestamp with the next one.
+                if (next_created - created) < delete_gap:
+                    # This row is followed closely by another, we can discard
+                    # this one.
+                    ids_to_delete.append(history_id)
+
+            next_created = created
+
+        verb = "Would have deleted" if self.dry_run else "Deleting"
+        self.say("{verb} {to_delete} rows of {total} for student_module_id {id}".format(
+            verb=verb,
+            to_delete=len(ids_to_delete),
+            total=len(history),
+            id=student_module_id,
+        ))
+
+        if ids_to_delete and not self.dry_run:
+            self.delete_history(ids_to_delete)