Quantcast
Channel: Backup and restore
Viewing all articles
Browse latest Browse all 6815

Extract files from a Moodle .mbz backup

$
0
0
by John Pye.  

In case it is useful, here is a Python script for 'manually' extracting all of the embedded files in a downloaded course backup .mbz. This is useful if you want to get out all of the various uploaded files, such as lecture slides, course handouts, spreadsheets, etc., that were uploaded and provided to students. The script does a bit of work to make sure the extracted files have the correct names, by reading the filenames from the 'files.xml' file and extracting the relevant files from the 'files' subfolder (since they are named there by their content hash, not by their originally uploaded filename). The script also reads the .mbz file directly, so you don't need to untar it first.

import os
import shutil
import sys
import tarfile
import xml.etree.ElementTree as ET

def main(mbz_file_path):
    """Extract the uploaded files stored in a Moodle ``.mbz`` course backup.

    The backup is opened directly as a gzip-compressed tar archive. Its
    ``files.xml`` index is parsed, and every listed file is copied from
    ``files/<first two hash characters>/<contenthash>`` inside the archive to
    ``extracted_files/<filepath>/<filename>`` under the current working
    directory, so each file gets back its originally uploaded name.

    Entries whose filename is ``.`` or that have no MIME type are skipped.
    Entries without a content hash or filename, and entries whose destination
    would fall outside ``extracted_files``, are skipped with a message.

    Args:
        mbz_file_path: Path to the ``.mbz`` file. If it is not an existing
            file, a message is printed and the function returns.

    Raises:
        FileNotFoundError: If the archive contains no ``files.xml``.
        RuntimeError: If a file listed in ``files.xml`` cannot be read from
            the archive.
    """
    # Check if the file exists
    if not os.path.isfile(mbz_file_path):
        print(f"File not found: {mbz_file_path}")
        return

    output_dir = 'extracted_files'
    os.makedirs(output_dir, exist_ok=True)
    # Resolved once so every destination can be checked against it below.
    output_root = os.path.realpath(output_dir)

    # Open the .mbz file
    with tarfile.open(mbz_file_path, 'r:gz') as tar:
        # Index the members by name once so each lookup below is a dict access.
        members = {member.name: member for member in tar.getmembers()}

        # Locate files.xml by its exact base name rather than by suffix, so a
        # member such as "profiles.xml" is never mistaken for the index.
        index_member = next(
            (
                member
                for name, member in members.items()
                if member.isfile() and name.rsplit('/', 1)[-1] == 'files.xml'
            ),
            None,
        )
        if index_member is None:
            raise FileNotFoundError('files.xml not found in the .mbz archive')

        with tar.extractfile(index_member) as index_file:
            root = ET.fromstring(index_file.read())

        # Extract and rename files based on XML content
        for entry in root.findall('file'):
            # findtext() yields None for a missing child and '' for an empty
            # one, so the truthiness checks below cover both cases.
            contenthash = entry.findtext('contenthash')
            filename = entry.findtext('filename')
            mimetype = entry.findtext('mimetype')
            filepath = (entry.findtext('filepath') or '').strip('/')

            # Skip files with a filename of "." or without a MIME type
            if filename == "." or not mimetype:
                continue

            if not contenthash or not filename:
                print(f'Skipped incomplete entry: ({filename}, {mimetype})')
                continue

            # Tar member names always use '/', whatever the host OS, so the
            # archive path must not be built with os.path.join.
            source_path = f'files/{contenthash[:2]}/{contenthash}'

            # Construct the output file path and refuse anything that would
            # land outside the output directory ("..", absolute names, or a
            # symlink already present in the output tree).
            output_file_path = os.path.join(output_dir, filepath, filename)
            resolved_path = os.path.realpath(output_file_path)
            if not resolved_path.startswith(output_root + os.sep):
                print(f'Skipped unsafe path: {output_file_path}')
                continue

            # extractfile() returns None for members that carry no data
            # (e.g. directories); treat that the same as a missing member.
            file_member = members.get(source_path)
            source = None if file_member is None else tar.extractfile(file_member)
            if source is None:
                print(f'Missing file: {source_path} ({filename}, {mimetype})')
                # Debug information
                print("member list...")
                bucket_prefix = f'files/{contenthash[:2]}/'
                for name in members:
                    print(f" -- {name}")
                    if name.startswith(bucket_prefix):
                        print(f"Found related file in tar: {name}")
                raise RuntimeError("Failed file extraction")

            # Create necessary directories
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Stream the content so large uploads are never held in memory whole.
            with source, open(output_file_path, 'wb') as output_file:
                shutil.copyfileobj(source, output_file)
            print(f'Extracted: {output_file_path}')

    print('Extraction complete.')

if __name__ == '__main__':
    # Exactly one argument is expected: the path of the .mbz backup to unpack.
    if len(sys.argv) != 2:
        # Name the expected argument, report on stderr, and exit non-zero so
        # that calling scripts can detect the misuse.
        print("Usage: python extract_files_from_mbz.py <backup.mbz>", file=sys.stderr)
        sys.exit(2)
    main(sys.argv[1])


Viewing all articles
Browse latest Browse all 6815

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>