DOCX:
#!/usr/bin/env python3
## python inspect_docx_label.py -v 9.docx
"""
Sensitivity Label Extractor for DOCX Files
This script extracts sensitivity label information from Microsoft Word DOCX files.
It searches through various XML components within the DOCX file to find label IDs
and names that may have been applied for information protection/classification.
Usage:
python sensitivity_label_extractor.py <path_to_docx_file>
Author: Generated for document security analysis
"""
import zipfile
import re
import xml.etree.ElementTree as ET
import sys
import os
import argparse
from pathlib import Path
GUID_RE = r'[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}'

# Package part (and its XML namespace) that holds custom document properties.
_CUSTOM_PROPS_PART = 'docProps/custom.xml'
_CUSTOM_PROPS_NS = 'http://schemas.openxmlformats.org/officeDocument/2006/custom-properties'
# Every <property> element in docProps/custom.xml carries this same ``fmtid``
# GUID.  It identifies the property set, not a label, so it must never be
# reported as a label ID.
_CUSTOM_PROPS_FMTID = 'D5CDD505-2E9C-101B-9397-08002B2CF9AE'
# Label metadata is written as properties named MSIP_Label_<label GUID>_<field>
# (fields include Enabled, Name, SiteId, ...).
_MSIP_PROPERTY_RE = re.compile(r'^MSIP_Label_\{?(' + GUID_RE + r')\}?_(\w+)$', re.I)
# Substrings that make a property or an XML part worth a closer look.
_LABEL_KEYWORD_RE = re.compile(r'msip|sensitivity|label|aip|compliance', re.I)
# JSON-like  "labelName": "..."  /  "displayName" = "..."  pairs.
_LABEL_NAME_PAIR_RE = re.compile(
    r'(?:"labelName"|"label_name"|"displayName"|"Name")\s*[:=]\s*"([^"]{1,200})"', re.I)


def _candidates_from_custom_properties(xml_text):
    """Build label candidates from the text of docProps/custom.xml.

    Properties named ``MSIP_Label_<GUID>_<field>`` are grouped per GUID into
    one candidate ``{'source', 'label_id', 'label_name'}``; the label name is
    taken from the ``_Name`` field when present.  Other properties that merely
    mention a label keyword are kept as informational
    ``{'source', 'name', 'value'}`` records (they carry no ID/name and are
    therefore never selected as the result).

    Raises whatever ``ET.fromstring`` raises for malformed XML.
    """
    root = ET.fromstring(xml_text)
    labels = {}   # lower-cased label GUID -> candidate dict (insertion ordered)
    others = []
    for prop in root.findall('.//{%s}property' % _CUSTOM_PROPS_NS):
        name = prop.get('name', '')
        # The value lives in the first child element (vt:lpwstr, etc.).
        value = next((child.text for child in prop), None)
        if not (_LABEL_KEYWORD_RE.search(name) or _LABEL_KEYWORD_RE.search(str(value))):
            continue
        print(f" Found potential label property: {name} = {value}")
        msip = _MSIP_PROPERTY_RE.match(name)
        if msip is None:
            others.append({'source': _CUSTOM_PROPS_PART, 'name': name, 'value': value})
            continue
        entry = labels.setdefault(
            msip.group(1).lower(),
            {'source': _CUSTOM_PROPS_PART, 'label_id': msip.group(1), 'label_name': None},
        )
        if msip.group(2).lower() == 'name' and value and value.strip():
            entry['label_name'] = value.strip()
    return list(labels.values()) + others


def _guess_label_name(txt, part_name):
    """Heuristically pull a label name out of one XML part, or return None.

    First looks for a JSON-like ``"labelName": "..."`` pair; otherwise parses
    the XML and returns the first non-blank text of an element whose local
    name looks label-related, or the first non-blank attribute whose name
    looks label-related.  Parse errors are reported and yield None.
    """
    pair = _LABEL_NAME_PAIR_RE.search(txt)
    if pair:
        print(f" Found potential label name: {pair.group(1)}")
        return pair.group(1)
    try:
        root = ET.fromstring(txt)
        for elem in root.iter():
            tag = elem.tag.split('}', 1)[-1]  # drop the '{namespace}' prefix
            if re.search(r'(label|sensitivity|displayname|name)', tag, re.I):
                if elem.text and elem.text.strip():
                    print(f" Found label name in element {tag}: {elem.text.strip()}")
                    return elem.text.strip()
            for k, v in elem.attrib.items():
                if re.search(r'(name|displayname|label)', k, re.I) and v.strip():
                    print(f" Found label name in attribute {k}: {v.strip()}")
                    return v.strip()
    except Exception as e:
        # not well-formed XML - skip
        print(f" XML parsing error in {part_name}: {e}")
    return None


def extract_sensitivity_label_from_docx(path):
    """
    Return a dict with 'label_id', 'label_name' and 'source', or None.

    Two passes are made over the package:

    1. docProps/custom.xml is parsed structurally and its
       ``MSIP_Label_<GUID>_*`` properties are decoded into a label ID (the
       GUID in the property name) and a label name (the ``_Name`` value).
    2. Every other ``.xml``/``.rels`` part that mentions a label keyword is
       scanned heuristically: the first GUID found is taken as the label ID
       and a label name is guessed from JSON-like pairs, element text or
       attributes.  docProps/custom.xml is only scanned this way when pass 1
       could not parse it, and the constant ``fmtid`` GUID of the custom
       property set is never accepted as a label ID.

    The first candidate that has both an ID and a name wins; otherwise the
    first candidate that has either one.

    Args:
        path (str or os.PathLike): Path to the DOCX file

    Returns:
        dict or None: Dictionary containing label information or None if the
        file is missing, unreadable, or no label was found.  Progress and
        error messages are printed to stdout.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as str
    # Validate input file
    if not os.path.exists(path):
        print(f"Error: File '{path}' does not exist.")
        return None
    if not path.lower().endswith('.docx'):
        print(f"Warning: File '{path}' does not have a .docx extension.")

    candidates = []
    try:
        with zipfile.ZipFile(path, 'r') as z:
            names = z.namelist()
            print(f"Analyzing DOCX file with {len(names)} internal components...")

            # 1) custom properties (docProps/custom.xml)
            custom_props_parsed = False
            if _CUSTOM_PROPS_PART in names:
                print(" Checking custom properties...")
                try:
                    txt = z.read(_CUSTOM_PROPS_PART).decode('utf-8', errors='ignore')
                    candidates.extend(_candidates_from_custom_properties(txt))
                    custom_props_parsed = True
                except Exception as e:
                    print(f" Error reading custom properties: {e}")

            # 2) scan other XML parts (customXml, word/settings.xml, etc.)
            xml_files_checked = 0
            for name in names:
                if not name.endswith(('.xml', '.rels')):
                    continue
                xml_files_checked += 1
                if custom_props_parsed and name == _CUSTOM_PROPS_PART:
                    # Already decoded structurally; the generic heuristics
                    # would misread its fmtid/name attributes as a label.
                    continue
                try:
                    raw = z.read(name)
                except KeyError:
                    continue
                txt = raw.decode('utf-8', errors='ignore')
                if not _LABEL_KEYWORD_RE.search(txt):
                    continue
                print(f" Checking {name} for sensitivity labels...")

                # pick the first GUID (heuristic), ignoring the property-set fmtid
                guids = [g for g in re.findall(GUID_RE, txt)
                         if g.upper() != _CUSTOM_PROPS_FMTID]
                label_id = guids[0] if guids else None
                if label_id:
                    print(f" Found potential label ID: {label_id}")

                label_name = _guess_label_name(txt, name)
                if label_id or label_name:
                    candidates.append({
                        'source': name,
                        'label_id': label_id,
                        'label_name': label_name,
                        'snippet': txt[:1000]
                    })
            print(f"Checked {xml_files_checked} XML files in the document.")
    except zipfile.BadZipFile:
        print(f"Error: '{path}' is not a valid ZIP/DOCX file.")
        return None
    except Exception as e:
        print(f"Error reading file '{path}': {e}")
        return None

    # pick best candidate
    if not candidates:
        return None
    print(f"Found {len(candidates)} potential sensitivity label candidates.")
    # prefer entries that have both id and name
    for c in candidates:
        if c.get('label_id') and c.get('label_name'):
            return {'label_id': c['label_id'], 'label_name': c['label_name'], 'source': c['source']}
    for c in candidates:
        if c.get('label_name') or c.get('label_id'):
            return {'label_id': c.get('label_id'), 'label_name': c.get('label_name'), 'source': c['source']}
    return None
def main():
    """Parse the command line, run the extraction and print a summary.

    Without ``-v/--verbose`` the progress output of
    ``extract_sensitivity_label_from_docx`` is captured instead of shown;
    only its top-level ``Error:`` / ``Warning:`` lines are replayed so that a
    missing or corrupt file is not silently reported as "no label found".

    Exits with status 1 if the extraction raises or is interrupted.
    """
    # Imported locally because the module header does not import them.
    # ``sys`` is deliberately NOT re-imported here: a function-local
    # ``import sys`` inside a conditional would make ``sys`` an unbound local
    # on the other branch.
    import contextlib
    import io

    parser = argparse.ArgumentParser(
        description='Extract sensitivity labels from DOCX files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
python sensitivity_label_extractor.py document.docx
python sensitivity_label_extractor.py "C:\\path\\to\\my document.docx"
'''
    )
    parser.add_argument('docx_file', help='Path to the DOCX file to analyze')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Enable verbose output')
    args = parser.parse_args()

    captured = io.StringIO()
    try:
        if args.verbose:
            info = extract_sensitivity_label_from_docx(args.docx_file)
        else:
            # redirect_stdout restores sys.stdout even if the call raises or
            # the user presses Ctrl+C.
            with contextlib.redirect_stdout(captured):
                info = extract_sensitivity_label_from_docx(args.docx_file)

        # Surface problems that quiet mode would otherwise hide.
        for line in captured.getvalue().splitlines():
            if line.startswith(('Error:', 'Warning:')):
                print(line)

        if info:
            print('\n=== SENSITIVITY LABEL FOUND ===')
            if info.get('label_name'):
                print(f'Label Name: {info["label_name"]}')
            if info.get('label_id'):
                print(f'Label ID: {info["label_id"]}')
            print(f'Source: {info["source"]}')
            print('================================')
        else:
            print('\n=== NO SENSITIVITY LABEL FOUND ===')
            print('No sensitivity label detected in this document.')
            print('This could mean:')
            print(' - The document has no sensitivity label applied')
            print(' - The label is stored in an encrypted/proprietary format')
            print(' - The label is stored in a location not checked by this script')
            print('===================================')
    except KeyboardInterrupt:
        print("\nOperation cancelled by user.")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
if __name__ == '__main__':
    # With no command line arguments, show example usage and ask for the
    # file path interactively; otherwise hand over to argparse in main().
    if len(sys.argv) == 1:
        print("Sensitivity Label Extractor for DOCX Files")
        print("==========================================")
        print()
        print("Usage:")
        print(" python sensitivity_label_extractor.py <path_to_docx_file>")
        print(" python sensitivity_label_extractor.py -h (for help)")
        print()
        print("Example:")
        print(" python sensitivity_label_extractor.py example.docx")
        print()
        try:
            file_path = input("Enter path to DOCX file (or press Ctrl+C to exit): ").strip()
            # Remove quotes if the user added them; do it before the
            # emptiness check so an input of just "" is treated as empty.
            file_path = file_path.strip('"\'')
            if file_path:
                sys.argv = [sys.argv[0], file_path]
                main()
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin is closed or not interactive (e.g. piped).
            print("\nExiting...")
            sys.exit(0)
    else:
        main()
PPTX:
#python inspect_pptx_label.py 9.pptx
import zipfile
import xml.etree.ElementTree as ET
import re
import sys
# Case-insensitive pattern of substrings treated as evidence of sensitivity-label
# metadata; it is searched in property names/values, element tags/text and raw
# part text.
KEYWORDS = re.compile(r'msip|msip_label|sensitivity|sensitivitylabel|compliancetag|label|azureinformationprotection', re.I)
def _first_child_text(elem):
    """Return the text of *elem*'s first child, or *elem*'s own text if it has no children."""
    first = next(iter(elem), None)
    if first is None:
        return elem.text
    return first.text
def parse_custom_properties_xml(data):
    """Extract ``(name, value)`` pairs from the content of docProps/custom.xml.

    Every element whose local tag name is ``property`` (case-insensitive,
    namespace ignored) contributes one pair: the ``name`` (or ``Name``)
    attribute and the text returned by ``_first_child_text``.  An empty list
    is returned when the content cannot be parsed as XML.
    """
    try:
        root = ET.fromstring(data)
    except Exception:
        # Second chance: decode leniently and parse the resulting text.
        try:
            root = ET.fromstring(data.decode('utf-8', errors='ignore'))
        except Exception:
            return []
    return [
        (node.attrib.get('name') or node.attrib.get('Name'), _first_child_text(node))
        for node in root.iter()
        # split('}', 1)[-1] strips a leading '{namespace}' when there is one
        if node.tag.split('}', 1)[-1].lower() == 'property'
    ]
def find_sensitivity_metadata_in_pptx(path):
    """Collect label-like metadata records from an OOXML (.pptx) package.

    Three passes are made; a part that already produced a record in an
    earlier pass is not reported again by the fallback pass:

    1. ``docProps/custom.xml``: one record
       ``{'source', 'property', 'value'}`` per custom property whose name or
       value matches ``KEYWORDS``.
    2. ``customXml/*`` parts: ``{'source', 'match_snippets'}`` when the
       UTF-8 text matches ``KEYWORDS``; otherwise the part is parsed as XML
       and the first matching element yields
       ``{'source', 'element_tag', 'element_text'}``.
    3. Fallback: every remaining part outside ``ppt/media/`` and
       ``ppt/embeddings/`` has its first 200000 bytes searched, yielding
       ``{'source', 'snippet'}``.

    Args:
        path: Path to the package file.

    Returns:
        list[dict]: The records found, de-duplicated by
        ``(source, property or element_tag)``; empty when nothing matched.

    Raises:
        ValueError: If *path* is not a zip file.
    """
    if not zipfile.is_zipfile(path):
        raise ValueError(f'{path} is not a zip/OOXML file (not a .pptx?)')

    # Context of up to 80 characters either side of a keyword.  Includes
    # 'azureinformationprotection' so that a part which matched KEYWORDS only
    # through that keyword does not produce an empty snippet list.
    snippet_re = re.compile(
        r'.{0,80}(?:msip|sensitivity|label|compliance|azureinformationprotection).{0,80}',
        re.I,
    )

    found = []
    with zipfile.ZipFile(path, 'r') as z:
        names = z.namelist()

        # 1) docProps/custom.xml (common place for MSIP_Label properties)
        if 'docProps/custom.xml' in names:
            data = z.read('docProps/custom.xml')
            for name, val in parse_custom_properties_xml(data):
                if (name and KEYWORDS.search(name)) or (val and KEYWORDS.search(str(val))):
                    found.append({'source': 'docProps/custom.xml', 'property': name, 'value': val})

        # 2) customXml parts (sometimes label info is stored inside customXml)
        for n in names:
            if not n.startswith('customXml/') or n.endswith('/'):
                continue
            try:
                data = z.read(n)
            except Exception:
                # best effort: an unreadable member is skipped, not fatal
                continue
            text = data.decode('utf-8', errors='ignore')
            if text and KEYWORDS.search(text):
                # capture short snippets that match
                found.append({'source': n, 'match_snippets': snippet_re.findall(text)[:5]})
                continue
            # No hit in the UTF-8 view (e.g. the part uses another encoding):
            # let the XML parser decode it and inspect element names and text.
            try:
                root = ET.fromstring(data)
                for elem in root.iter():
                    tag_local = elem.tag.split('}', 1)[-1]
                    if KEYWORDS.search(tag_local) or (elem.text and KEYWORDS.search(elem.text)):
                        found.append({'source': n, 'element_tag': tag_local, 'element_text': elem.text})
                        break
            except Exception:
                pass

        # 3) fallback: keyword-scan the parts that have not been reported yet
        already_reported = {r['source'] for r in found}
        for n in names:
            if n in already_reported:
                continue
            if n.endswith('/') or n.startswith(('ppt/media/', 'ppt/embeddings/')):
                continue
            try:
                data = z.read(n)
            except Exception:
                continue
            # only check a chunk for big files
            chunk_text = data[:200000].decode('utf-8', errors='ignore')
            if KEYWORDS.search(chunk_text):
                found.append({'source': n, 'snippet': snippet_re.findall(chunk_text)[:3]})

    # de-duplicate results by (source, property/element_tag)
    uniq = []
    seen = set()
    for r in found:
        key = (r.get('source'), r.get('property') or r.get('element_tag') or '')
        if key in seen:
            continue
        seen.add(key)
        uniq.append(r)
    return uniq
if __name__ == '__main__':
    # Script entry point: expects the package path as the first argument.
    if not sys.argv[1:]:
        print("Usage: python inspect_pptx_label.py file.pptx")
        sys.exit(1)

    path = sys.argv[1]
    try:
        results = find_sensitivity_metadata_in_pptx(path)
    except Exception as exc:
        print("Error:", exc)
        sys.exit(2)

    if results:
        print("Found label-like metadata:")
        for record in results:
            print("----")
            # one "key: value" line per field of the record
            for field, content in record.items():
                print(f"{field}: {content}")
    else:
        print("No label-like metadata found in the package. (It might be encrypted or stored elsewhere.)")
No comments:
Post a Comment