by John Pye.
In case it is useful, here is a Python script for 'manually' extracting all of the embedded files from a downloaded course backup (.mbz). This is useful if you want to get out all of the various uploaded files, such as lecture slides, course handouts, spreadsheets, etc., that were uploaded and provided to students. The script does a bit of work to make sure the extracted files have the correct names, by reading the filenames from the 'files.xml' file and extracting the relevant files from the 'files' subfolder (since they are named there by their hash, not by their originally uploaded filename). The script also accesses the .mbz file directly; you don't need to untar it first.
import os
import shutil
import sys
import tarfile
import xml.etree.ElementTree as ET
def main(mbz_file_path):
    """Extract the uploaded files from a course backup (.mbz) archive.

    The archive is read directly as a gzip-compressed tar. Its ``files.xml``
    index maps each stored blob (named by content hash under
    ``files/<first two hash chars>/<hash>``) to the original upload name and
    folder. Every indexed file is written to
    ``extracted_files/<filepath>/<filename>`` below the current working
    directory; an existing file at that path is overwritten.

    Args:
        mbz_file_path: Path to the ``.mbz`` backup file.

    Returns:
        None. If ``mbz_file_path`` is not an existing file, a message is
        printed and the function returns without doing anything.

    Raises:
        FileNotFoundError: The archive contains no ``files.xml``.
        RuntimeError: A file listed in ``files.xml`` is not present in the
            archive (raised on the first such file, after printing
            diagnostic information).
    """
    # Check if the file exists
    if not os.path.isfile(mbz_file_path):
        print(f"File not found: {mbz_file_path}")
        return

    output_dir = 'extracted_files'
    os.makedirs(output_dir, exist_ok=True)
    # Absolute root used to verify that nothing is written outside output_dir.
    output_root = os.path.abspath(output_dir)

    # Open the .mbz file
    with tarfile.open(mbz_file_path, 'r:gz') as tar:
        # Index the members once: tar.getmember() scans the whole member
        # list on every call, which is quadratic over a large backup.
        # A leading "./" is dropped so "./files/ab/..." and "files/ab/..."
        # are treated as the same path. Later duplicates win, matching
        # tar.getmember().
        members = {}
        for member in tar.getmembers():
            name = member.name
            while name.startswith('./'):
                name = name[2:]
            members[name] = member

        # Locate files.xml, preferring the one at the archive root.
        xml_member = members.get('files.xml')
        if xml_member is None:
            xml_member = next(
                (m for n, m in members.items() if n.endswith('/files.xml')),
                None,
            )
        xml_stream = tar.extractfile(xml_member) if xml_member is not None else None
        if xml_stream is None:
            raise FileNotFoundError('files.xml not found in the .mbz archive')
        with xml_stream:
            files_xml = xml_stream.read()

        # Parse the files.xml file
        root = ET.fromstring(files_xml)

        # Extract and rename files based on XML content
        for entry in root.findall('file'):
            # findtext() yields None for a missing child and '' for an empty
            # one, so malformed entries are skipped instead of crashing.
            contenthash = entry.findtext('contenthash')
            filename = entry.findtext('filename')
            mimetype = entry.findtext('mimetype')
            filepath = (entry.findtext('filepath') or '').strip('/')

            # Skip files with a filename of "." or without a MIME type
            if filename == "." or not mimetype:
                continue
            if not contenthash or not filename:
                print(f'Skipping incomplete entry: ({filename}, {mimetype})')
                continue

            # Tar member names always use forward slashes, so the lookup key
            # must not be built with os.path.join (backslashes on Windows).
            source_path = f"files/{contenthash[:2]}/{contenthash}"

            # Construct the output file path
            output_file_path = os.path.join(output_dir, filepath, filename)

            # The names come from the backup itself; refuse anything (e.g.
            # ".." components or absolute paths) that would land outside
            # the output directory.
            resolved = os.path.abspath(output_file_path)
            if not resolved.startswith(output_root + os.sep):
                print(f'Skipping unsafe path: {output_file_path}')
                continue

            # KeyError covers both an unknown member name and a link member
            # whose target is missing; extractfile() returns None for
            # members that are not regular files or links.
            try:
                source = tar.extractfile(members[source_path])
            except KeyError:
                source = None

            if source is None:
                print(f'Missing file: {source_path} ({filename}, {mimetype})')
                # Debug information
                print("member list...")
                related_prefix = f"files/{contenthash[:2]}/"
                for name in members:
                    print(f" -- {name}")
                    if name.startswith(related_prefix):
                        print(f"Found related file in tar: {name}")
                raise RuntimeError("Failed file extraction")

            # Create necessary directories
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Stream the content so large uploads are not held in memory.
            with source, open(output_file_path, 'wb') as output_file:
                shutil.copyfileobj(source, output_file)
            print(f'Extracted: {output_file_path}')

    print('Extraction complete.')
if __name__ == '__main__':
    # Exactly one argument is expected: the path to the .mbz backup file.
    if len(sys.argv) != 2:
        # Report misuse on stderr with a non-zero exit status so calling
        # scripts can detect it.
        print(
            "Usage: python extract_files_from_mbz.py <path_to_mbz_file>",
            file=sys.stderr,
        )
        sys.exit(1)
    main(sys.argv[1])