Code for How to Extract Metadata from Docx Files in Python Tutorial


View on Github

docs_metadata_extractor.py

import docx  # Import the docx library for working with Word documents.
from pprint import pprint  # Import the pprint function for pretty printing.

def extract_metadata(docx_file):
    """Collect the core properties of a Word document into a plain dict.

    Args:
        docx_file: The document to inspect; handed unchanged to
            ``docx.Document`` (this file's own usage passes a path string).

    Returns:
        dict: One entry per public, non-callable attribute found on the
        document's ``core_properties`` object. The ``created``,
        ``modified`` and ``last_printed`` values are rendered as
        ``'YYYY-MM-DD HH:MM:SS'`` strings, or ``None`` when unset. When the
        core-properties object exposes a non-empty ``custom_properties``
        attribute, its name/value pairs are stored as a nested dict under
        the ``'custom_properties'`` key.
    """
    datetime_props = ('created', 'modified', 'last_printed')

    doc = docx.Document(docx_file)
    core_properties = doc.core_properties

    metadata = {}

    # Walk every attribute name the core-properties object reports.
    for prop in dir(core_properties):
        # Skip all private names, not only dunders: a single-underscore
        # attribute (e.g. the wrapper's internal ``_element`` handle) is an
        # implementation detail, not document metadata, and would otherwise
        # end up in the result as a raw object.
        if prop.startswith('_'):
            continue
        value = getattr(core_properties, prop)
        if callable(value):  # Methods are not metadata values.
            continue
        if prop in datetime_props:
            # Render datetimes as text; a missing/falsy value becomes None.
            value = value.strftime('%Y-%m-%d %H:%M:%S') if value else None
        metadata[prop] = value

    # Custom properties are optional: the attribute may not exist on the
    # core-properties object, in which case this step is skipped silently.
    try:
        custom_properties = core_properties.custom_properties
        if custom_properties:
            metadata['custom_properties'] = {}
            for prop in custom_properties:
                metadata['custom_properties'][prop.name] = prop.value
    except AttributeError:
        # Best effort by design: no custom-properties support available.
        pass

    return metadata



# Demo driver. Guarded so that importing this module to reuse
# extract_metadata() does not try to open 'test.docx' or print anything;
# running the file as a script behaves exactly as before.
if __name__ == '__main__':
    docx_path = 'test.docx'  # Path to the Word document file.
    metadata = extract_metadata(docx_path)  # Gather the document's metadata.
    pprint(metadata)  # Pretty print the metadata dictionary.