import os
import random
import zipfile
from zipfile import ZipFile


class WordCorruptor:
    """Produce deliberately damaged copies of a Word document.

    The instance only stores its configuration; the source file is never
    modified, and nothing is read or written at construction time.
    """

    def __init__(self, input_path, output_path, intensity=0.01):
        """Store the paths and the corruption intensity.

        Args:
            input_path: Path of the existing file to read.
            output_path: Path where the corrupted copy is written.
            intensity: Fraction of the file's bytes to corrupt
                (0.01 means 1% of the bytes).
        """
        self.input_path = input_path
        self.output_path = output_path
        # Fraction (not a 0-100 percentage): it is multiplied directly by
        # the file length to obtain the number of bytes to alter.
        self.intensity = intensity
Abstract

The intentional corruption of Microsoft Word files serves various legitimate purposes, including software robustness testing, digital forensics training, and data recovery research. This paper examines the structural vulnerabilities of the DOC and DOCX formats, proposes a systematic methodology for generating controlled corruptions, and evaluates the practical applications of such generators. We classify corruption techniques into binary-level, XML-level, and ZIP-structure attacks, providing a technical reference for researchers and security analysts.

1. Introduction

Microsoft Word files, whether in the legacy DOC format (OLE Compound File) or the modern DOCX format (ZIP container with XML), possess predictable internal structures. A corrupt-file generator deliberately alters bytes, headers, or references to render the file unreadable by standard word processors, or to simulate real-world data degradation.

generador de archivos corruptos word
def corrupt_docx_zip(self):
    """Write a copy of the input DOCX with junk bytes appended after the ZIP data.

    The input is first opened as a ZIP archive, so a file that is not a
    valid ZIP is rejected before the output file is created. The output is
    the input's bytes, unchanged, followed by a fixed 230-byte marker
    (``b'CORRUPTED_BY_GENERATOR\\x00'`` repeated 10 times). The archive's
    own records are not modified.

    Raises:
        zipfile.BadZipFile: If ``self.input_path`` is not a ZIP archive.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Validation only: opening the archive raises BadZipFile for non-ZIP
    # input. The member list itself is not needed.
    with ZipFile(self.input_path, 'r'):
        pass

    with open(self.output_path, 'wb') as fout, open(self.input_path, 'rb') as fin:
        fout.write(fin.read())
        # Trailing garbage after the end of the ZIP structure.
        # NOTE(review): readers that scan backwards for the
        # end-of-central-directory record may still open the result;
        # confirm this produces the failure mode you intend to test.
        fout.write(b'CORRUPTED_BY_GENERATOR\x00' * 10)
def corrupt_binary(self):
    """Write a copy of the input file with a fraction of its bytes altered.

    The number of altered bytes is ``max(1, int(size * self.intensity))``,
    capped at the file size. Each chosen byte is at a distinct position and
    is guaranteed to differ from its original value, so exactly that many
    bytes differ between input and output. The file length is unchanged.

    An empty input has nothing to corrupt and yields an empty output file.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(self.input_path, 'rb') as f:
        data = bytearray(f.read())

    # At least one byte for any non-empty file, never more than the file
    # holds (this also makes the count 0 for an empty file).
    num_changes = min(len(data), max(1, int(len(data) * self.intensity)))

    # Distinct positions: sampling with replacement could hit the same
    # offset twice and corrupt fewer bytes than requested.
    for pos in random.sample(range(len(data)), num_changes):
        # XOR with a non-zero mask always yields a different byte value;
        # a plain random byte would leave it unchanged 1 time in 256.
        data[pos] ^= random.randint(1, 255)

    with open(self.output_path, 'wb') as f:
        f.write(data)