arisingmedia-web-sops/wp-divi-pipeline-to-am-stack/scripts/extract_wpress.py

#!/usr/bin/env python3
"""Extract All-in-One WP Migration .wpress archive.

Usage:
    python3 extract_wpress.py <path/to/file.wpress> <output/directory>

The .wpress format is a sequential binary archive with 4377-byte headers:
    255 bytes  filename (null-padded)
     14 bytes  file size in bytes (ASCII digits, null-padded)
     12 bytes  mtime unix timestamp (ASCII digits, null-padded)
   4096 bytes  relative path (null-padded)
Followed immediately by the raw file bytes, then the next header.
"""
import os
import sys
import argparse
from pathlib import Path

# Widths of the fixed-size fields of a .wpress entry header, in on-disk order.
NAME_LEN = 255    # file name, NUL-padded
SIZE_LEN = 14     # file size in bytes as ASCII digits, NUL-padded
MTIME_LEN = 12    # mtime unix timestamp as ASCII digits, NUL-padded
PATH_LEN = 4096   # relative path, NUL-padded

# The four fields are stored back to back, so the header is their sum (4377).
HEADER_SIZE = NAME_LEN + SIZE_LEN + MTIME_LEN + PATH_LEN


def _parse_int(b: bytes) -> int:
    """Parse a NUL-padded ASCII integer field.

    Only the bytes before the first NUL are considered; surrounding
    whitespace is ignored. An empty field yields 0. ``int()`` raises
    ``ValueError`` if the remaining text is not a valid integer.
    """
    head, _sep, _padding = b.partition(b"\x00")
    digits = head.decode(errors="replace").strip()
    if not digits:
        return 0
    return int(digits)


def _parse_str(b: bytes) -> str:
    """Decode a NUL-padded text field up to (not including) its first NUL.

    Undecodable bytes are substituted with the replacement character rather
    than raising.
    """
    end = b.find(b"\x00")
    raw = b if end < 0 else b[:end]
    return raw.decode(errors="replace")


def _resolve_dest(root: Path, rel_dir: str, name: str):
    """Return the on-disk target for an archive entry, or ``None`` if unsafe.

    ``root`` must already be resolved. The entry is rejected when ``name`` is
    not a plain file name, or when the resolved target does not lie
    underneath ``root`` (e.g. because of ``..`` components or an absolute
    path stored in the archive).
    """
    if name == ".." or Path(name).name != name:
        return None
    # Leading separators would make the join below discard ``root`` entirely.
    candidate = (root / rel_dir.lstrip("/\\") / name).resolve()
    return candidate if root in candidate.parents else None


def extract(wpress_path: str, out_dir: str, verbose: bool = True) -> dict:
    """Extract every entry of a .wpress archive into ``out_dir``.

    Args:
        wpress_path: Path to the .wpress archive to read.
        out_dir: Destination directory; created if it does not exist.
        verbose: When true, print a progress line every 200 extracted files.

    Returns:
        A dict with the keys ``files`` (entries written), ``bytes`` (bytes
        actually written), ``mb`` (``bytes`` in MiB, one decimal),
        ``skipped`` (entries without a name or with an unsafe destination)
        and ``out_dir``.

    Raises:
        ValueError: If a numeric header field is not a valid integer or the
            size field is negative.
        OSError: If the archive cannot be read or a target cannot be written.

    Entries whose destination would fall outside ``out_dir`` are skipped with
    a warning on stderr instead of being written. A truncated archive stops
    the extraction with a warning; the data read so far is kept.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    # Resolved once so every entry can be checked against the same root.
    root = out.resolve()
    eof_block = bytes(HEADER_SIZE)  # an all-zero header marks the end
    chunk_size = 65536
    count = 0
    total_bytes = 0
    skipped = 0

    with open(wpress_path, "rb") as f:
        while True:
            header = f.read(HEADER_SIZE)
            if len(header) < HEADER_SIZE:
                if header:
                    print(
                        f"WARNING: truncated header at end of {wpress_path}",
                        file=sys.stderr,
                    )
                break
            if header == eof_block:
                break

            size_at = NAME_LEN
            mtime_at = size_at + SIZE_LEN
            path_at = mtime_at + MTIME_LEN
            name = _parse_str(header[:size_at])
            size = _parse_int(header[size_at:mtime_at])
            mtime = _parse_int(header[mtime_at:path_at])
            path = _parse_str(header[path_at:path_at + PATH_LEN])

            if size < 0:
                # A negative size would make the relative seek go backwards.
                raise ValueError(f"corrupt header: negative size for {name!r}")

            if not name:
                skipped += 1
                f.seek(size, 1)
                continue

            dest_file = _resolve_dest(root, path, name)
            if dest_file is None:
                print(
                    f"WARNING: skipping entry outside output directory: "
                    f"{path!r} / {name!r}",
                    file=sys.stderr,
                )
                skipped += 1
                f.seek(size, 1)
                continue

            dest_file.parent.mkdir(parents=True, exist_ok=True)

            written = 0
            with open(dest_file, "wb") as o:
                while written < size:
                    chunk = f.read(min(chunk_size, size - written))
                    if not chunk:
                        break
                    o.write(chunk)
                    written += len(chunk)

            if mtime > 0:
                try:
                    os.utime(dest_file, (mtime, mtime))
                except (OSError, OverflowError, ValueError):
                    # Restoring the timestamp is best-effort only; the file
                    # contents have already been written.
                    pass

            count += 1
            total_bytes += written

            if written < size:
                print(
                    f"WARNING: archive truncated inside {name!r} "
                    f"({written} of {size} bytes)",
                    file=sys.stderr,
                )
                break

            if verbose and count % 200 == 0:
                print(f"  [{count} files | {total_bytes / 1024 / 1024:.1f} MB extracted]", flush=True)

    result = {
        "files": count,
        "bytes": total_bytes,
        "mb": round(total_bytes / 1024 / 1024, 1),
        "skipped": skipped,
        "out_dir": str(out),
    }
    print(f"DONE: {count} files | {result['mb']} MB -> {out_dir} (skipped {skipped})")
    return result


def main():
    """Parse the command line and extract the requested archive."""
    parser = argparse.ArgumentParser(description="Extract .wpress archive")
    parser.add_argument("wpress", help="Path to .wpress file")
    parser.add_argument("outdir", help="Destination directory")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress progress output")
    opts = parser.parse_args()

    # --quiet only silences the periodic progress lines.
    show_progress = not opts.quiet
    extract(opts.wpress, opts.outdir, verbose=show_progress)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()