Thursday, October 16, 2025

Read Microsoft Sensitivity Labels in Python

 DOCX:
#!/usr/bin/env python3

# Example: python inspect_docx_label.py -v 9.docx

"""
Sensitivity Label Extractor for DOCX Files

This script extracts sensitivity label information from Microsoft Word DOCX files.
It searches through various XML components within the DOCX file to find label IDs
and names that may have been applied for information protection/classification.

Usage:
    python sensitivity_label_extractor.py <path_to_docx_file>
   
Author: Generated for document security analysis
"""

import zipfile
import re
import xml.etree.ElementTree as ET
import sys
import os
import argparse
from pathlib import Path

GUID_RE = r'[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}'

# Keywords that mark a property or package part as possibly label-related.
_LABEL_KEYWORD_RE = re.compile(r'(label|sensitivity|msip|aip|compliance)', re.I)

# Custom-property names of the form "MSIP_Label_<GUID>_<Field>"
# (for example "..._Name" or "..._Enabled").
_MSIP_PROPERTY_RE = re.compile(r'^MSIP_Label_(' + GUID_RE + r')_(\w+)$', re.I)

# JSON-like or attribute-like `"labelName": "..."` pairs embedded in a part.
_LABEL_NAME_PAIR_RE = re.compile(
    r'(?:"labelName"|"label_name"|"displayName"|"Name")\s*[:=]\s*"([^"]{1,200})"',
    re.I,
)

# Fully-qualified tag of a <property> element in docProps/custom.xml.
_CUSTOM_PROPERTY_TAG = (
    '{http://schemas.openxmlformats.org/officeDocument/2006/custom-properties}property'
)

# Format ID carried in the `fmtid` attribute of custom-property elements.
# It is a fixed identifier, not a label ID, so the GUID heuristic must ignore it.
_CUSTOM_PROPERTY_FMTID = 'D5CDD505-2E9C-101B-9397-08002B2CF9AE'


def _msip_candidates_from_custom_properties(raw):
    """Build label candidates from the raw bytes of docProps/custom.xml.

    Properties named ``MSIP_Label_<GUID>_<Field>`` are grouped by GUID; each
    GUID yields one candidate whose ``label_id`` is the GUID and whose
    ``label_name`` is the value of its ``_Name`` property (``None`` if absent).

    Raises whatever ``ET.fromstring`` raises for malformed XML.
    """
    root = ET.fromstring(raw)
    fields_by_guid = {}
    for prop in root.findall('.//' + _CUSTOM_PROPERTY_TAG):
        name = prop.get('name', '')
        # The value lives in the first child element (vt:lpwstr, etc.).
        value = next((child.text for child in prop), None)
        if _LABEL_KEYWORD_RE.search(name) or _LABEL_KEYWORD_RE.search(str(value)):
            print(f"    Found potential label property: {name} = {value}")
        match = _MSIP_PROPERTY_RE.match(name)
        if match:
            fields_by_guid.setdefault(match.group(1), {})[match.group(2).lower()] = value
    return [
        {
            'source': 'docProps/custom.xml',
            'label_id': guid,
            'label_name': fields.get('name'),
        }
        for guid, fields in fields_by_guid.items()
    ]


def _guess_label_name(part_name, txt):
    """Heuristically find a label name in the text of one package part.

    Tries a ``"labelName": "..."`` style pair first, then parses the text as
    XML and returns the first non-empty element text or attribute value whose
    tag/attribute name looks name-like. Returns ``None`` when nothing is found
    or the text is not well-formed XML.
    """
    pair = _LABEL_NAME_PAIR_RE.search(txt)
    if pair:
        print(f"    Found potential label name: {pair.group(1)}")
        return pair.group(1)

    try:
        root = ET.fromstring(txt)
    except (ET.ParseError, ValueError) as e:
        # Not well-formed XML: skip this part.
        print(f"    XML parsing error in {part_name}: {e}")
        return None

    for elem in root.iter():
        tag = elem.tag.split('}', 1)[-1]  # drop the "{namespace}" prefix
        if re.search(r'(label|sensitivity|displayname|name)', tag, re.I):
            if elem.text and elem.text.strip():
                found = elem.text.strip()
                print(f"    Found label name in element {tag}: {found}")
                return found
        for key, val in elem.attrib.items():
            if re.search(r'(name|displayname|label)', key, re.I) and val.strip():
                found = val.strip()
                print(f"    Found label name in attribute {key}: {found}")
                return found
    return None


def _pick_best_candidate(candidates):
    """Return the first candidate with both id and name, else the first with either."""
    for want_both in (True, False):
        for c in candidates:
            has_id, has_name = bool(c.get('label_id')), bool(c.get('label_name'))
            if (has_id and has_name) if want_both else (has_id or has_name):
                return {
                    'label_id': c.get('label_id'),
                    'label_name': c.get('label_name'),
                    'source': c['source'],
                }
    return None


def extract_sensitivity_label_from_docx(path):
    """
    Return a dict with 'label_id', 'label_name' and 'source', or None.

    Two strategies are used, in priority order:

    1. Structured: ``MSIP_Label_<GUID>_<Field>`` custom properties in
       docProps/custom.xml. The GUID in the property name is the label ID and
       the ``_Name`` property holds the label name.
    2. Heuristic: every ``.xml``/``.rels`` part that mentions
       'label' / 'msip' / 'sensitivity' / 'aip' / 'compliance' is searched for
       a GUID and a name-like string.

    Structured results always win over heuristic ones, because the heuristic
    scan can pick up unrelated GUIDs and names.

    Progress messages are printed to stdout.

    Args:
        path (str): Path to the DOCX file

    Returns:
        dict or None: Dictionary containing label information, or None if the
        file is missing, is not a readable ZIP package, or no label was found.
    """
    # Validate input file
    if not os.path.exists(path):
        print(f"Error: File '{path}' does not exist.")
        return None

    if not path.lower().endswith('.docx'):
        print(f"Warning: File '{path}' does not have a .docx extension.")

    structured = []
    heuristic = []

    try:
        with zipfile.ZipFile(path, 'r') as z:
            names = z.namelist()
            print(f"Analyzing DOCX file with {len(names)} internal components...")

            # 1) custom properties (docProps/custom.xml)
            if 'docProps/custom.xml' in names:
                print("  Checking custom properties...")
                try:
                    structured = _msip_candidates_from_custom_properties(
                        z.read('docProps/custom.xml')
                    )
                except Exception as e:
                    # Best effort: a broken custom.xml must not abort the scan.
                    print(f"    Error reading custom properties: {e}")

            # 2) scan other XML parts (customXml, word/settings.xml, etc.)
            xml_files_checked = 0
            for name in names:
                if not name.endswith(('.xml', '.rels')):
                    continue

                xml_files_checked += 1
                if structured and name == 'docProps/custom.xml':
                    # Already interpreted precisely in step 1; the heuristic
                    # would only add a misleading duplicate.
                    continue

                try:
                    raw = z.read(name)
                except KeyError:
                    continue

                txt = raw.decode('utf-8', errors='ignore')
                if not _LABEL_KEYWORD_RE.search(txt):
                    continue

                print(f"  Checking {name} for sensitivity labels...")

                # Pick the first GUID (heuristic), ignoring the constant
                # custom-property format ID.
                label_id = next(
                    (g for g in re.findall(GUID_RE, txt)
                     if g.upper() != _CUSTOM_PROPERTY_FMTID),
                    None,
                )
                if label_id:
                    print(f"    Found potential label ID: {label_id}")

                label_name = _guess_label_name(name, txt)

                if label_id or label_name:
                    heuristic.append({
                        'source': name,
                        'label_id': label_id,
                        'label_name': label_name,
                        'snippet': txt[:1000]
                    })

            print(f"Checked {xml_files_checked} XML files in the document.")

    except zipfile.BadZipFile:
        print(f"Error: '{path}' is not a valid ZIP/DOCX file.")
        return None
    except Exception as e:
        print(f"Error reading file '{path}': {e}")
        return None

    total = len(structured) + len(heuristic)
    if not total:
        return None

    print(f"Found {total} potential sensitivity label candidates.")

    # Structured custom-property labels take precedence over heuristic guesses.
    return _pick_best_candidate(structured) or _pick_best_candidate(heuristic)


def main():
    """Parse command-line arguments, run the extraction and print a summary.

    Without ``-v/--verbose`` the progress output of the extraction is
    discarded and only the final summary is shown. Exits with status 1 on
    Ctrl+C or on any unexpected error.
    """
    # Imported unconditionally at the top of the function. A conditional
    # `import sys` here would turn `sys` into a local name for the whole
    # function and break the `sys.exit()` calls whenever the import is skipped.
    import contextlib
    import io

    parser = argparse.ArgumentParser(
        description='Extract sensitivity labels from DOCX files',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python sensitivity_label_extractor.py document.docx
  python sensitivity_label_extractor.py "C:\\path\\to\\my document.docx"
        '''
    )
    parser.add_argument('docx_file', help='Path to the DOCX file to analyze')
    parser.add_argument('-v', '--verbose', action='store_true',
                       help='Enable verbose output')

    args = parser.parse_args()

    try:
        if args.verbose:
            info = extract_sensitivity_label_from_docx(args.docx_file)
        else:
            # Suppress intermediate output for cleaner results. The context
            # manager restores stdout even if the extraction raises or the
            # user presses Ctrl+C, so the messages below are always visible.
            with contextlib.redirect_stdout(io.StringIO()):
                info = extract_sensitivity_label_from_docx(args.docx_file)

        if info:
            print('\n=== SENSITIVITY LABEL FOUND ===')
            if info.get('label_name'):
                print(f'Label Name: {info["label_name"]}')
            if info.get('label_id'):
                print(f'Label ID: {info["label_id"]}')
            print(f'Source: {info["source"]}')
            print('================================')
        else:
            print('\n=== NO SENSITIVITY LABEL FOUND ===')
            print('No sensitivity label detected in this document.')
            print('This could mean:')
            print('  - The document has no sensitivity label applied')
            print('  - The label is stored in an encrypted/proprietary format')
            print('  - The label is stored in a location not checked by this script')
            print('===================================')

    except KeyboardInterrupt:
        print("\nOperation cancelled by user.")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == '__main__':
    if len(sys.argv) != 1:
        # Arguments were supplied: let argparse handle them.
        main()
    else:
        # No arguments: show a short usage banner, then prompt for a path.
        for banner_line in (
            "Sensitivity Label Extractor for DOCX Files",
            "==========================================",
            "",
            "Usage:",
            "  python sensitivity_label_extractor.py <path_to_docx_file>",
            "  python sensitivity_label_extractor.py -h  (for help)",
            "",
            "Example:",
            "  python sensitivity_label_extractor.py example.docx",
            "",
        ):
            print(banner_line)

        try:
            file_path = input("Enter path to DOCX file (or press Ctrl+C to exit): ").strip()
            if file_path:
                # Drop surrounding quotes the user may have typed or pasted,
                # then hand the path to main() as if it came from the shell.
                sys.argv = [sys.argv[0], file_path.strip('"\'')]
                main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(0)


PPTX:
#python inspect_pptx_label.py 9.pptx

import zipfile  
import xml.etree.ElementTree as ET  
import re  
import sys  
 
# Case-insensitive pattern for text that hints at label metadata. It is used
# with .search(), so some alternatives are redundant: "msip_label" is already
# covered by "msip", and "sensitivitylabel" by "sensitivity".
KEYWORDS = re.compile(r'msip|msip_label|sensitivity|sensitivitylabel|compliancetag|label|azureinformationprotection', re.I)
 
def _first_child_text(elem):  
    children = list(elem)  
    if children:  
        return children[0].text  
    return elem.text  
 
def parse_custom_properties_xml(data):
    """Extract ``(property_name, value)`` pairs from docProps/custom.xml content.

    Every element whose local tag name is ``property`` (case-insensitive,
    namespace ignored) contributes one pair: its ``name`` (or ``Name``)
    attribute and the text returned by ``_first_child_text``. Returns an empty
    list when the content cannot be parsed as XML.
    """
    try:
        root = ET.fromstring(data)
    except Exception:
        # Second attempt: decode leniently and parse the resulting text.
        try:
            root = ET.fromstring(data.decode('utf-8', errors='ignore'))
        except Exception:
            return []

    pairs = []
    for node in root.iter():
        local_tag = node.tag.split('}', 1)[-1]  # strip any "{namespace}" prefix
        if local_tag.lower() != 'property':
            continue
        attrs = node.attrib
        pairs.append((attrs.get('name') or attrs.get('Name'), _first_child_text(node)))
    return pairs
 
def find_sensitivity_metadata_in_pptx(path):
    """Collect label-like metadata records from an OOXML (zip) package.

    Three passes are made over the package:

    1. ``docProps/custom.xml`` properties whose name or value matches KEYWORDS.
    2. ``customXml/`` parts: keyword snippets from the decoded text, or, when
       the text does not match, the first XML element whose tag or text matches.
    3. Every other member except directories, ``ppt/media/`` and
       ``ppt/embeddings/``: keyword snippets from the first 200000 bytes.

    Records are de-duplicated on ``(source, property-or-element_tag)``, keeping
    the first occurrence.

    Raises:
        ValueError: if *path* is not a zip file.
    """
    if not zipfile.is_zipfile(path):
        raise ValueError(f'{path} is not a zip/OOXML file (not a .pptx?)')

    snippet_pattern = re.compile(r'.{0,80}(?:msip|sensitivity|label|compliance).{0,80}', re.I)
    hits = []

    with zipfile.ZipFile(path, 'r') as archive:
        members = archive.namelist()

        # Pass 1: docProps/custom.xml (common place for MSIP_Label properties)
        if 'docProps/custom.xml' in members:
            payload = archive.read('docProps/custom.xml')
            for prop_name, prop_value in parse_custom_properties_xml(payload):
                name_matches = prop_name and KEYWORDS.search(prop_name)
                if not name_matches and not (prop_value and KEYWORDS.search(str(prop_value))):
                    continue
                hits.append({'source': 'docProps/custom.xml', 'property': prop_name, 'value': prop_value})

        # Pass 2: customXml parts (sometimes label info is stored inside customXml)
        for member in members:
            if member.endswith('/') or not member.startswith('customXml/'):
                continue
            try:
                payload = archive.read(member)
            except Exception:
                continue
            try:
                decoded = payload.decode('utf-8', errors='ignore')
            except Exception:
                decoded = None
            if decoded and KEYWORDS.search(decoded):
                # Keep only a few short snippets around each match.
                hits.append({'source': member, 'match_snippets': snippet_pattern.findall(decoded)[:5]})
                continue
            # Text did not match: parse as XML and inspect element names and text.
            try:
                for node in ET.fromstring(payload).iter():
                    local_tag = node.tag.split('}', 1)[-1]
                    if KEYWORDS.search(local_tag) or (node.text and KEYWORDS.search(node.text)):
                        hits.append({'source': member, 'element_tag': local_tag, 'element_text': node.text})
                        break
            except Exception:
                pass

        # Pass 3: fallback keyword scan over the remaining members
        for member in members:
            if member.endswith('/') or member.startswith(('ppt/media/', 'ppt/embeddings/')):
                continue
            try:
                payload = archive.read(member)
            except Exception:
                continue
            # Only the leading chunk of large members is inspected.
            head = payload[:200000]
            try:
                head_text = head.decode('utf-8', errors='ignore')
            except Exception:
                head_text = ''
            if KEYWORDS.search(head_text):
                hits.append({'source': member, 'snippet': snippet_pattern.findall(head_text)[:3]})

    # De-duplicate by (source, property/element_tag), first occurrence wins.
    unique = []
    seen_keys = set()
    for hit in hits:
        dedup_key = (hit.get('source'), hit.get('property') or hit.get('element_tag') or '')
        if dedup_key not in seen_keys:
            seen_keys.add(dedup_key)
            unique.append(hit)
    return unique
 
if __name__ == '__main__':
    # Exit codes: 1 = missing argument, 2 = inspection failed.
    if len(sys.argv) < 2:
        print("Usage: python inspect_pptx_label.py file.pptx")
        sys.exit(1)

    path = sys.argv[1]
    try:
        results = find_sensitivity_metadata_in_pptx(path)
    except Exception as e:
        print("Error:", e)
        sys.exit(2)

    if results:
        print("Found label-like metadata:")
        for record in results:
            # One separator line per record, then its key/value pairs.
            print("----")
            for field, content in record.items():
                print(f"{field}: {content}")
    else:
        print("No label-like metadata found in the package. (It might be encrypted or stored elsewhere.)")



No comments:

Post a Comment

Featured Post

Read Microsoft Sensitivity Labels in Python

 DOCX: #!/usr/bin/env python3 ##python inspect_docx_label.py -v 9.pptx   """ Sensitivity Label Extractor for DOCX Files This ...

Popular posts