#!/usr/bin/env python3
"""
Aumentum Browser Access Service
Combines database lookup with PDF conversion for browser-based document viewing.

This service allows third-party applications and browser extensions to:
1. Lookup property documents from Aumentum database
2. Convert .bin files to PDF for browser viewing
3. Retrieve property metadata and associated documents

Key Features:
- Connects to Aumentum MSSQL database (LRS43)
- Converts LEADTOOLS .bin files to PDF on-demand
- Provides REST API for document access
- Serves PDFs for browser viewing
"""

import os
import sys
import shutil

# Load .env for production environment variables (before any config is read)
try:
    from dotenv import load_dotenv
    load_dotenv()  # Load from .env in project root
except ImportError:
    pass  # python-dotenv not installed, use system env vars only
from pathlib import Path
from typing import Optional, Dict, List, Tuple
from PIL import Image
import tempfile
from pathlib import Path, PureWindowsPath, PurePosixPath

# Conditional import for pyodbc (only needed for MSSQL)
_pyodbc_available = None
def _get_pyodbc():
    """Lazy import of pyodbc - only when needed for MSSQL connections."""
    global _pyodbc_available
    if _pyodbc_available is None:
        try:
            import pyodbc
            _pyodbc_available = pyodbc
        except ImportError as e:
            _pyodbc_available = False
            raise ImportError(f"pyodbc not available (needed for MSSQL): {e}")
    if _pyodbc_available is False:
        raise ImportError("pyodbc not available (needed for MSSQL)")
    return _pyodbc_available

# try:
#     import pymssql
#     USE_PYMSSQL = True
# except ImportError:
#     import pyodbc
#     USE_PYMSSQL = False


class AumentumBrowserService:
    """Service for accessing and converting Aumentum documents for browser viewing."""
    
    def __init__(self, db_config: dict, contentstore_base: str = ""):
        """
        Initialize the service with database connection and contentstore path.
        
        Args:
            db_config: MSSQL connection parameters via ODBC
            contentstore_base: Base path to contentstore directory
        """
        self.db_config = db_config
        self.contentstore_base = contentstore_base
        
        # JPEG magic bytes
        self.JPEG_SIGNATURES = [
            b"\xFF\xD8\xFF\xE0", 
            b"\xFF\xD8\xFF\xE1", 
            b"\xFF\xD8\xFF\xE8",
            b"\xFF\xD8\xFF\xE2"  # Added alternative JPEG variant
        ]
    
    def _get_db_connection(self):
        """Return database connection (MSSQL or MySQL). Logs connection attempts."""
        db_type = self.db_config.get("type", "mssql").lower()
        
        if db_type == "mysql":
            return self._get_mysql_connection()
        else:
            return self._get_mssql_connection()
    
    def _get_mysql_connection(self):
        """Return MySQL connection. Logs connection attempts."""
        mysql_driver = None
        try:
            import pymysql
            mysql_driver = pymysql
            driver_name = "pymysql"
        except ImportError:
            try:
                import mysql.connector
                mysql_driver = mysql.connector
                driver_name = "mysql.connector"
            except ImportError:
                raise ImportError("MySQL driver not found. Install with: pip install pymysql or pip install mysql-connector-python")
        
        host = self.db_config.get("host") or self.db_config.get("server", "localhost")
        port = self.db_config.get("port", 3306)
        database = self.db_config.get("database")
        username = self.db_config.get("username") or self.db_config.get("user")
        password = self.db_config.get("password", "")
        charset = self.db_config.get("charset", "utf8mb4")
        
        print(f"🔌 Connecting to MySQL ({driver_name}): host={host}, port={port}, database={database}, username={username}")
        
        try:
            if driver_name == "pymysql":
                conn = mysql_driver.connect(
                    host=host,
                    port=port,
                    user=username,
                    password=password,
                    database=database,
                    charset=charset,
                    connect_timeout=10
                )
            else:
                # mysql.connector
                conn = mysql_driver.connect(
                    host=host,
                    port=port,
                    user=username,
                    password=password,
                    database=database,
                    charset=charset,
                    connection_timeout=10
                )
            print(f"✅ MySQL connection successful to {database}")
            return conn
        except Exception as e:
            print(f"❌ MySQL connection failed: {e}")
            raise
    
    def _get_mssql_connection(self):
        """Return MSSQL connection via ODBC. Logs connection attempts."""
        driver = self.db_config.get("driver", "ODBC Driver 17 for SQL Server")
        server = self.db_config.get("server") or self.db_config.get("host")
        database = self.db_config.get("database")
        username = self.db_config.get("username") or self.db_config.get("user")
        password = self.db_config.get("password", "")
        
        print(f"🔌 Connecting to MSSQL: server={server}, database={database}, username={username}, driver={driver}")
        
        # FreeTDS with DSN configuration
        if driver.lower() == "freetds":
            conn_str = (
                f"DRIVER={{{driver}}};"
                f"SERVERNAME={server};"  # Use SERVERNAME for DSN, not SERVER
                f"DATABASE={database};"
                f"UID={username};"
                f"PWD={password};"
            )
        else:
            # Standard ODBC Driver connection string
            port = self.db_config.get("port", 1433)
            encrypt = self.db_config.get("encrypt", "no")
            trust = self.db_config.get("trust_server_certificate", "yes")
            
            parts = [
                f"DRIVER={{{driver}}}",
                f"SERVER={server},{port}",
                f"DATABASE={database}",
                f"UID={username}",
                f"PWD={password}",
            ]
            
            if encrypt:
                parts.append(f"Encrypt={encrypt}")
            if trust:
                parts.append(f"TrustServerCertificate={trust}")
            
            conn_str = ";".join(parts)
        
        # Debug output
        debug_str = conn_str.replace(password, "****") if password else conn_str
        print(f"🔧 Connection string: {debug_str}")
        
        try:
            pyodbc = _get_pyodbc()  # Lazy import only when needed
            conn = pyodbc.connect(conn_str, timeout=10)
            print(f"✅ MSSQL connection successful to {database}")
            return conn
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            raise
    # def _get_db_connection(self):
    #     """Return MSSQL connection via ODBC. Logs connection attempts."""
    #     driver = self.db_config.get("driver", "ODBC Driver 17 for SQL Server")
    #     server = self.db_config.get("server") or self.db_config.get("host")
    #     port = self.db_config.get("port", 1433)
    #     database = self.db_config.get("database")
    #     username = self.db_config.get("username") or self.db_config.get("user")
    #     password = self.db_config.get("password", "")
    #     encrypt = self.db_config.get("encrypt")
    #     trust = self.db_config.get("trust_server_certificate")

    #     print(f"🔌 Connecting to MSSQL: server={server}:{port}, database={database}, username={username}, driver={driver}")
        
    #     parts = [
    #         # f"DRIVER={{{{ {driver} }}}}",
    #         f"DRIVER={{{driver}}}",
    #         f"SERVER={server},{port}",
    #         f"DATABASE={database}",
    #         f"UID={username}",
    #         f"PWD={'*' * len(password) if password else 'EMPTY'}",
    #     ]
    #     if encrypt:
    #         parts.append(f"Encrypt={encrypt}")
    #     if trust:
    #         parts.append(f"TrustServerCertificate={trust}")
    #     conn_str = ";".join(parts)
    #     conn = pyodbc.connect(conn_str)
    #     print(f"✅ MSSQL connection successful to {database}")
    #     return conn
    
    def _is_jpeg_file(self, file_path: str) -> bool:
        """Check if file contains JPEG data by reading magic bytes."""
        try:
            with open(file_path, "rb") as f:
                header = f.read(11)
                return header[:3] == b'\xff\xd8\xff'
        except Exception:
            return False
    
    # def convert_bin_to_pdf(self, bin_path: str, output_pdf_path: Optional[str] = None) -> Optional[str]:
    #     """
    #     Convert .bin file (LEADTOOLS JPEG) to PDF for browser viewing.
        
    #     Args:
    #         bin_path: Path to .bin file
    #         output_pdf_path: Optional output path. If None, creates temp file.
        
    #     Returns:
    #         Path to created PDF file, or None if conversion failed
    #     """
    #     if not os.path.exists(bin_path):
    #         print(f"Error: File not found: {bin_path}")
    #         print("i may be the culprit")
    #         return None
        
    #     if not self._is_jpeg_file(bin_path):
    #         print(f"Error: File is not a valid JPEG: {bin_path}")
    #         return None
        
    #     try:
    #         # If no output path specified, create temp PDF
    #         if output_pdf_path is None:
    #             temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
    #             output_pdf_path = temp_pdf
            
    #         # Step 1: Read .bin as JPEG
    #         with open(bin_path, "rb") as f:
    #             img = Image.open(f)
    #             rgb = img.convert("RGB")
            
    #         # Step 2: Save as PDF
    #         rgb.save(output_pdf_path, "PDF", resolution=100.0)
            
    #         return output_pdf_path
            
    #     except Exception as e:
    #         print(f"Error converting {bin_path} to PDF: {e}")
    #         return None
    def convert_bin_to_pdf(self, bin_path: str, output_pdf_path: Optional[str] = None) -> Optional[str]:
        """
        Convert .bin file (LEADTOOLS JPEG) to PDF for browser viewing.
        
        FIXED: Always writes to temp directory, never to contentstore.
        
        Args:
            bin_path: Path to .bin file
            output_pdf_path: Optional output path. If None, creates temp file.
        
        Returns:
            Path to created PDF file, or None if conversion failed
        """
        if not os.path.exists(bin_path):
            print(f"Error: File not found: {bin_path}")
            return None
        
        if not self._is_jpeg_file(bin_path):
            print(f"Error: File is not a valid JPEG: {bin_path}")
            return None
        
        try:
            # If no output path specified, create temp PDF in temp directory
            if output_pdf_path is None:
                temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf").name
                output_pdf_path = temp_pdf
            else:
                # CRITICAL FIX: If output path is in contentstore, redirect to temp directory
                # This prevents "Permission denied" errors when writing to read-only contentstore
                if "/contentstore/" in output_pdf_path or "\\contentstore\\" in output_pdf_path:
                    # Extract filename and create in temp directory instead
                    temp_dir = os.getenv("TEMP_PDF_DIR", "/tmp/aumentum_pdfs")
                    os.makedirs(temp_dir, exist_ok=True)
                    filename = os.path.basename(output_pdf_path)
                    output_pdf_path = os.path.join(temp_dir, filename)
                    output_pdf_path = os.path.normpath(output_pdf_path)
                    print(f"   📂 Redirecting PDF to temp directory: {output_pdf_path}")
            
            # Ensure parent directory exists
            parent_dir = os.path.dirname(output_pdf_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            
            # Step 1: Read .bin as JPEG
            with open(bin_path, "rb") as f:
                img = Image.open(f)
                rgb = img.convert("RGB")
            
            # Step 2: Save as PDF
            rgb.save(output_pdf_path, "PDF", resolution=100.0)
            
            return output_pdf_path
            
        except Exception as e:
            print(f"Error converting {bin_path} to PDF: {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def get_document_paths_by_property(self, property_id: Optional[str] = None, 
                                       apn: Optional[str] = None) -> List[str]:
        """
        Query database for document paths associated with a property.
        
        Args:
            property_id: Property ID (if available in database)
            apn: Assessor's Parcel Number
        
        Returns:
            List of document file paths
        """
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # TODO: Add property table joins once we discover the schema
            query = (
                "SELECT content_path "
                "FROM doc_storage "
                "WHERE 1=1"
            )
            cursor.execute(query)

            results = [r[0] for r in cursor.fetchall()]
            cursor.close()
            conn.close()
            return results

        except Exception as e:
            print(f"Database query error: {e}")
            return []
    
    # def lookup_document(self, content_path: str, convert_to_pdf: bool = True) -> Dict:
    #     """
    #     Lookup a document by its content path and optionally convert to PDF.
        
    #     Args:
    #         content_path: Relative path from database (e.g., "2014/11/03/file.bin")
    #         convert_to_pdf: If True, converts .bin to PDF for browser viewing
        
    #     Returns:
    #         Dictionary with document info and conversion results
    #     """
    #     result = {
    #         "content_path": content_path,
    #         "file_exists": False,
    #         "pdf_path": None,
    #         "is_jpeg": False,
    #         "error": None
    #     }
        
    #     # Construct full path
    #     if self.contentstore_base:
    #         full_path = os.path.join(self.contentstore_base, content_path)
    #         full_path = os.path.normpath(full_path)
    #     else:
    #         full_path = content_path
        
    #     # Check if file exists
    #     if not os.path.exists(full_path):
    #         result["error"] = f"File not found: {full_path}"
    #         return result
        
    #     result["file_exists"] = True
    #     result["file_path"] = full_path
        
    #     # Check if it's JPEG
    #     if full_path.lower().endswith(".bin"):
    #         result["is_jpeg"] = self._is_jpeg_file(full_path)
        
    #     # Convert to PDF if requested
    #     if convert_to_pdf and result["is_jpeg"]:
    #         pdf_path = full_path.replace(".bin", ".pdf")
    #         converted_path = self.convert_bin_to_pdf(full_path, pdf_path)
            
    #         if converted_path and os.path.exists(converted_path):
    #             result["pdf_path"] = converted_path
        
    #     return result
    
    # # ===========================
    # # LRSAdmin (MSSQL) INTEGRATIONS
    # # ===========================

    # def parse_store_url_to_path(self, store_url: str) -> Optional[str]:
    #     """Convert store://YYYY/M/D/H/M/UUID.bin to filesystem path under contentstore_base."""
    #     if not store_url or not store_url.lower().startswith("store://"):
    #         return None
    #     rel = store_url[len("store://"):]
    #     rel = rel.strip("/")
    #     if not self.contentstore_base:
    #         return rel
    #     return os.path.normpath(os.path.join(self.contentstore_base, rel))
    
    # def parse_store_url_to_path(self, store_url: str) -> Optional[str]:
    #     """
    #     Convert store://YYYY/M/D/H/M/UUID.bin to filesystem path under contentstore_base.
        
    #     Handles:
    #     - store://2014/11/3/14/22/filename.bin
    #     - store://2014/11/03/14/22/filename.bin (with zero-padded month)
    #     - Relative paths without store:// prefix
        
    #     Args:
    #         store_url: Store URL from Alfresco (e.g., store://2014/11/3/14/22/uuid.bin)
        
    #     Returns:
    #         Full filesystem path or None if invalid
    #     """
    #     if not store_url:
    #         print(f"⚠️  Empty store_url provided")
    #         return None
        
    #     # Remove store:// prefix if present
    #     if store_url.lower().startswith("store://"):
    #         rel_path = store_url[len("store://"):]
    #     else:
    #         rel_path = store_url
        
    #     # Clean up the path
    #     rel_path = rel_path.strip("/")
        
    #     if not rel_path:
    #         print(f"⚠️  Empty relative path after parsing: {store_url}")
    #         return None
        
    #     # Build full path
    #     if not self.contentstore_base:
    #         print(f"⚠️  contentstore_base not configured")
    #         return rel_path  # Return relative path as fallback
        
    #     full_path = os.path.normpath(os.path.join(self.contentstore_base, rel_path))
        
    #     print(f"🔍 Path Resolution:")
    #     print(f"   Input: {store_url}")
    #     print(f"   Base:  {self.contentstore_base}")
    #     print(f"   Full:  {full_path}")
        
    #     return full_path
    
    def parse_store_url_to_path(self, store_url: str) -> Optional[str]:
        """
        Convert store://YYYY/M/D/H/M/UUID.bin to filesystem path.
        
        FIXED: Properly handles Windows backslash vs Linux forward slash.
        
        Args:
            store_url: Store URL from Alfresco (e.g., store://2014/12/18/16/20/uuid.bin)
        
        Returns:
            Full filesystem path with OS-appropriate separators
        """
        if not store_url:
            print(f"⚠️  Empty store_url provided")
            return None
        
        # Remove store:// prefix if present
        if store_url.lower().startswith("store://"):
            rel_path = store_url[len("store://"):]
        else:
            rel_path = store_url
        
        # Clean up path
        rel_path = rel_path.strip("/").strip("\\")
        
        if not rel_path:
            print(f"⚠️  Empty relative path after parsing: {store_url}")
            return None
        
        # CRITICAL FIX: Convert forward slashes to OS-specific separator
        # Database stores with /, but Windows needs \
        path_parts = rel_path.replace("\\", "/").split("/")
        rel_path_normalized = os.sep.join(path_parts)
        
        if not self.contentstore_base:
            print(f"⚠️  contentstore_base not configured")
            return rel_path_normalized
        
        # Build full path and normalize
        full_path = os.path.join(self.contentstore_base, rel_path_normalized)
        full_path = os.path.normpath(full_path)  # Essential for Windows!
        
        print(f"🔍 Path Resolution:")
        print(f"   Store URL:  {store_url}")
        print(f"   Relative:   {rel_path_normalized}")
        print(f"   Base:       {self.contentstore_base}")
        print(f"   Full Path:  {full_path}")
        print(f"   OS:         {os.name} (sep='{os.sep}')")
        
        return full_path

    def lookup_document(self, content_path: str, convert_to_pdf: bool = True) -> Dict:
        """
        Lookup a document by its content path and optionally convert to PDF.
        
        Args:
            content_path: Can be:
                - Relative path: "2014/11/03/file.bin"
                - Store URL: "store://2014/11/03/file.bin"
                - Full path: "/path/to/contentstore/2014/11/03/file.bin"
            convert_to_pdf: If True, converts .bin to PDF for browser viewing
        
        Returns:
            Dictionary with document info and conversion results
        """
        result = {
            "content_path": content_path,
            "file_exists": False,
            "file_path": None,
            "pdf_path": None,
            "is_jpeg": False,
            "error": None
        }
        
        print(f"\n{'='*60}")
        print(f"LOOKUP DOCUMENT: {content_path}")
        print(f"{'='*60}")
        
        # Determine if this is a full path or needs resolution
        if os.path.isabs(content_path) and os.path.exists(content_path):
            # Already a full path
            full_path = content_path
            print(f"📁 Using provided full path")
        else:
            # Parse as store URL or relative path
            full_path = self.parse_store_url_to_path(content_path)
            if not full_path:
                result["error"] = f"Could not parse content path: {content_path}"
                print(f"❌ {result['error']}")
                return result
        
        # Normalize the path
        full_path = os.path.normpath(full_path)
        
        # Check if file exists
        if not os.path.exists(full_path):
            print("look well oo, i fit be culprit")
            result["error"] = f"File not found: {full_path}"
            print(f"❌ {result['error']}")
            
            # Debug: Check parent directory
            parent = os.path.dirname(full_path)
            if os.path.exists(parent):
                files = os.listdir(parent)
                print(f"📂 Parent directory exists with {len(files)} files")
                if files:
                    print(f"   Sample files: {files[:5]}")
            else:
                print(f"📂 Parent directory does not exist: {parent}")
            
            return result
        
        result["file_exists"] = True
        result["file_path"] = full_path
        print(f"✅ File found: {full_path}")
        print(f"   Size: {os.path.getsize(full_path):,} bytes")
        
        # Check if it's JPEG
        if full_path.lower().endswith(".bin"):
            result["is_jpeg"] = self._is_jpeg_file(full_path)
            print(f"🖼️  Is JPEG: {result['is_jpeg']}")
        
        # Convert to PDF if requested and it's a JPEG
        if convert_to_pdf and result["is_jpeg"]:
            # Generate PDF path (same location as .bin file)
            pdf_path = full_path.replace(".bin", ".pdf")
            
            # Check if PDF already exists
            if os.path.exists(pdf_path):
                print(f"📄 Using existing PDF: {pdf_path}")
                result["pdf_path"] = pdf_path
            else:
                print(f"🔄 Converting to PDF: {pdf_path}")
                converted_path = self.convert_bin_to_pdf(full_path, pdf_path)
                
                if converted_path and os.path.exists(converted_path):
                    result["pdf_path"] = converted_path
                    print(f"✅ PDF created: {converted_path}")
                    print(f"   Size: {os.path.getsize(converted_path):,} bytes")
                else:
                    result["error"] = "PDF conversion failed"
                    print(f"❌ {result['error']}")
        
        return result

    def list_recent_lr_source_documents(self, limit: int = 20) -> List[Dict]:
        """Fetch recent source documents from lr_source_document."""
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # MySQL uses LIMIT instead of TOP
            db_type = self.db_config.get("type", "mssql").lower()
            if db_type == "mysql":
                query = (
                    "SELECT id, document_number, document_type, acceptance, recordation, comments, page_count "
                    "FROM lr_source_document ORDER BY id DESC LIMIT %s"
                )
                cursor.execute(query, (int(limit),))
            else:
                # MSSQL uses TOP
                query = (
                    "SELECT TOP (?) id, document_number, document_type, acceptance, recordation, comments, page_count "
                    "FROM LRSAdmin.lr_source_document ORDER BY id DESC"
                )
                query_exec = query.replace("(?)", str(int(limit)))
                cursor.execute(query_exec)
            rows = cursor.fetchall()
            cols = [c[0] for c in cursor.description]
            results = [dict(zip(cols, row)) for row in rows]
            cursor.close()
            conn.close()
            return results
        except Exception as e:
            print(f"LRS query error (lr_source_document): {e}")
            return []

    def list_recent_alf_content(self, limit: int = 20) -> List[Dict]:
        """Fetch recent content nodes and URLs from Alfresco tables."""
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # MySQL uses LIMIT instead of TOP, and no schema prefix
            db_type = self.db_config.get("type", "mssql").lower()
            if db_type == "mysql":
                query = (
                    "SELECT n.id, n.uuid, cu.content_url, cu.content_size, mt.mimetype_str "
                    "FROM alf_node n "
                    "INNER JOIN alf_content_data cd ON n.id = cd.id "
                    "INNER JOIN alf_content_url cu ON cd.content_url_id = cu.id "
                    "LEFT JOIN alf_mimetype mt ON cd.content_mimetype_id = mt.id "
                    "WHERE n.node_deleted = 0 ORDER BY n.id DESC LIMIT %s"
                )
                cursor.execute(query, (int(limit),))
            else:
                # MSSQL uses TOP and schema prefix
                query = (
                    "SELECT TOP (?) n.id, n.uuid, cu.content_url, cu.content_size, mt.mimetype_str "
                    "FROM LRSAdmin.alf_node n "
                    "INNER JOIN LRSAdmin.alf_content_data cd ON n.id = cd.id "
                    "INNER JOIN LRSAdmin.alf_content_url cu ON cd.content_url_id = cu.id "
                    "LEFT JOIN LRSAdmin.alf_mimetype mt ON cd.content_mimetype_id = mt.id "
                    "WHERE n.node_deleted = 0 ORDER BY n.id DESC"
                )
                query_exec = query.replace("(?)", str(int(limit)))
                cursor.execute(query_exec)
            rows = cursor.fetchall()
            cols = [c[0] for c in cursor.description]
            results = [dict(zip(cols, row)) for row in rows]
            cursor.close()
            conn.close()
            return results
        except Exception as e:
            print(f"LRS query error (alf content): {e}")
            return []

    # def fetch_pdf_by_store_url(self, store_url: str) -> Dict:
    #     """Given a store:// URL, convert underlying .bin to PDF and return result dict."""
    #     result = {"store_url": store_url, "pdf_path": None, "file_path": None, "error": None}
    #     path = self.parse_store_url_to_path(store_url)
    #     if not path:
    #         result["error"] = "Invalid store_url"
    #         return result
    #     result["file_path"] = path
    #     if not os.path.exists(path):
    #         result["error"] = f"File not found: {path}"
    #         return result
    #     if not self._is_jpeg_file(path):
            
    #         result["error"] = "Not a JPEG/LEADTOOLS .bin"
    #         return result
    #     pdf_path = path.replace(".bin", ".pdf")
    #     converted = self.convert_bin_to_pdf(path, pdf_path)
    #     if converted and os.path.exists(converted):
    #         result["pdf_path"] = converted
    #     else:
    #         result["error"] = "Conversion failed"
    #     return result
    def fetch_pdf_by_store_url(self, store_url: str) -> Dict:
        """
        Given a store:// URL, convert underlying .bin to PDF and return result dict.
        Enhanced with better error reporting and path validation.
        """
        result = {
            "store_url": store_url,
            "pdf_path": None,
            "file_path": None,
            "error": None
        }
        
        print(f"\n{'='*60}")
        print("FETCH PDF BY STORE URL")
        print(f"{'='*60}")
        print(f"Store URL: {store_url}")
        
        # Parse the store URL to file path
        path = self.parse_store_url_to_path(store_url)
        if not path:
            result["error"] = "Invalid store_url"
            print(f"❌ {result['error']}")
            return result
        
        result["file_path"] = path
        print(f"📁 File path: {path}")
        
        # Check if file exists
        if not os.path.exists(path):
            print("why are you suspecting me")
            result["error"] = f"File not found: {path}"
            print(f"❌ {result['error']}")
            
            # Debug: Check if parent directory exists
            parent = os.path.dirname(path)
            if os.path.exists(parent):
                files = os.listdir(parent)
                print(f"📂 Parent exists with {len(files)} files")
            else:
                print(f"📂 Parent directory missing: {parent}")
            
            return result
        
        print(f"✅ File exists ({os.path.getsize(path):,} bytes)")
        
        # Check if it's a JPEG
        if not self._is_jpeg_file(path):
            result["error"] = "Not a JPEG/LEADTOOLS .bin"
            print(f"❌ {result['error']}")
            return result
        
        print(f"🖼️  Valid JPEG file")
        
        # Generate PDF path
        pdf_path = path.replace(".bin", ".pdf")
        
        # Check if PDF already exists
        if os.path.exists(pdf_path):
            print(f"📄 Using existing PDF: {pdf_path}")
            result["pdf_path"] = pdf_path
            return result
        
        # Convert to PDF
        print(f"🔄 Converting to PDF: {pdf_path}")
        converted = self.convert_bin_to_pdf(path, pdf_path)
        
        if converted and os.path.exists(converted):
            result["pdf_path"] = converted
            print(f"✅ PDF created ({os.path.getsize(converted):,} bytes)")
        else:
            result["error"] = "Conversion failed"
            print(f"❌ {result['error']}")
        
        return result

    # def resolve_store_urls_by_document_number(self, document_number: str) -> List[Dict]:
    #     """Resolve one or more store:// URLs for a given document_number using Alfresco node properties.

    #     Looks up LRSAdmin.alf_node_properties where qname local_name in ('targetRids','sourceRids')
    #     and string_value = document_number, then joins to alf tables to get content_url.
    #     """
    #     results: List[Dict] = []
    #     try:
    #         conn = self._get_db_connection()
    #         cursor = conn.cursor()
    #         # build query (pyodbc may not parameterize IN TOP; we don't use TOP here)
    #         query = (
    #             "SELECT n.id, n.uuid, cu.content_url, cu.content_size, mt.mimetype_str "
    #             "FROM LRSAdmin.alf_node_properties np "
    #             "JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id "
    #             "JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0 "
    #             "JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id "
    #             "JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id "
    #             "LEFT JOIN LRSAdmin.alf_mimetype mt ON mt.id = cd.content_mimetype_id "
    #             "WHERE np.string_value = ? AND q.local_name IN ('targetRids','sourceRids')"
    #         )
    #         cursor.execute(query, (document_number,))
    #         cols = [d[0] for d in cursor.description]
    #         for row in cursor.fetchall():
    #             results.append(dict(zip(cols, row)))
    #         cursor.close()
    #         conn.close()
    #     except Exception as e:
    #         print(f"resolve_store_urls_by_document_number error: {e}")
    #     return results
    # def resolve_store_urls_by_document_number(self, document_number: str) -> List[Dict]:
    #     """
    #     Resolve one or more store:// URLs for a given document_number via lr_source_document
    #     and group by document_id (each with its own content_url set).
        
    #     FIXED: Now uses filesystem-based discovery for multi-page documents.
    #     Legacy Aumentum only stores 1 database reference but pages are on filesystem.
    #     Uses timestamp proximity to find all pages when page_count > 1.
    #     """
    #     results: List[Dict] = []
    #     try:
    #         conn = self._get_db_connection()
    #         cursor = conn.cursor()

    #         # Step 1: Get all document IDs linked to the document number
    #         cursor.execute("""
    #             SELECT id AS document_id, document_type, page_count, issued_by, create_date
    #             FROM LRSAdmin.lr_source_document
    #             WHERE document_number = ?
    #         """, (document_number,))
    #         docs = cursor.fetchall()
    #         if not docs:
    #             cursor.close()
    #             conn.close()
    #             return []

    #         # Step 2: Get the database reference (usually just 1)
    #         cursor.execute("""
    #             SELECT DISTINCT
    #                 n.id AS node_id,
    #                 n.uuid,
    #                 cu.content_url,
    #                 cu.content_size,
    #                 mt.mimetype_str
    #             FROM LRSAdmin.alf_node_properties np
    #             JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id
    #             JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0
    #             JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id
    #             JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id
    #             LEFT JOIN LRSAdmin.alf_mimetype mt ON mt.id = cd.content_mimetype_id
    #             WHERE np.string_value = ? 
    #             AND q.local_name IN ('targetRids','sourceRids')
    #             ORDER BY cu.content_url
    #         """, (document_number,))
            
    #         cols = [d[0] for d in cursor.description]
    #         db_images = [dict(zip(cols, row)) for row in cursor.fetchall()]

    #         cursor.close()
    #         conn.close()
            
    #         print(f"📊 Database returned {len(db_images)} reference(s) for {document_number}")

    #         # Step 3: For each document ID, determine actual images
    #         for doc_id, doc_type, page_count, issued_by, create_date in docs:
    #             actual_images = db_images.copy()
                
    #             # CRITICAL: Use filesystem discovery if page_count > actual images
    #             if page_count and page_count > len(db_images) and len(db_images) > 0:
    #                 print(f"🔍 Document ID {doc_id}: Expected {page_count} pages, found {len(db_images)} in DB")
    #                 print(f"   Using filesystem-based discovery...")
                    
    #                 # Use filesystem discovery to find all pages
    #                 discovered_urls = self._discover_pages_by_filesystem(
    #                     db_images[0]['content_url'],
    #                     page_count
    #                 )
                    
    #                 if len(discovered_urls) > len(db_images):
    #                     # Convert URLs to image dicts
    #                     actual_images = []
    #                     for url in discovered_urls:
    #                         actual_images.append({
    #                             'node_id': None,
    #                             'uuid': None,
    #                             'content_url': url,
    #                             'content_size': None,
    #                             'mimetype_str': None
    #                         })
    #                     print(f"   ✅ Filesystem discovery found {len(actual_images)} pages")
                
    #             results.append({
    #                 "document_id": doc_id,
    #                 "document_type": doc_type,
    #                 "page_count": page_count,
    #                 "issued_by": issued_by,
    #                 "create_date": create_date.isoformat() if create_date else None,
    #                 "images": actual_images,
    #             })

    #         print(f"✅ Total: {len(results)} document ID(s) processed")

    #     except Exception as e:
    #         print(f"resolve_store_urls_by_document_number error: {e}")
    #         import traceback
    #         traceback.print_exc()

    #     return results
    # WORKAROUND: Direct file URL mapping for mislabeled nodes
    # ROOT CAUSE: During 2015 scanning, nodes were labeled with wrong document numbers
    #
    # Complete mapping based on:
    # - Database query showing which URLs are labeled with which document numbers
    # - Manual PDF verification showing what content each file actually contains
    #
    # Database Labels:
    #   Node 823587  labeled "PL11089" → URL: store://2015/3/26/.../3eee6f3f...fed.bin → Contains PL689
    #   Node 729874  labeled "PL689"   → URL: store://2015/3/17/.../879dcd53...275.bin → Contains BP102
    #   Node 907984  labeled "BP102"   → URL: store://2015/4/28/.../df4050c2...878b.bin → Contains PL6204
    #   Node 1098399 labeled "PL6204"  → URL: store://2015/7/10/.../a57f38d9...4d13.bin → Contains PL12321
    #   Node 1214297 labeled "PL12321" → NO URL (no file uploaded)
    #
    # Correct Mapping (each document gets the file that contains its content):
    CORRECT_FILE_MAPPING = {
        'PL689': {
            'correct_url': 'store://2015/3/26/15/8/3eee6f3f-0b98-41b9-a6cb-2c4488152fed.bin',
            'wrong_label': 'PL11089',
            'reason': 'File currently labeled as PL11089 actually contains PL689 content'
        },
        'BP102': {
            'correct_url': 'store://2015/3/17/10/10/879dcd53-f552-4e82-858f-7e868e60a275.bin',
            'wrong_label': 'PL689',
            'reason': 'File currently labeled as PL689 actually contains BP102 content'
        },
        'PL6204': {
            'correct_url': 'store://2015/4/28/12/15/df4050c2-af68-42cf-947a-5afb85c4878b.bin',
            'wrong_label': 'BP102',
            'reason': 'File currently labeled as BP102 actually contains PL6204 content'
        },
        'PL12321': {
            'correct_url': 'store://2015/7/10/13/19/a57f38d9-24aa-48df-835f-9497a39e4d13.bin',
            'wrong_label': 'PL6204',
            'reason': 'File currently labeled as PL6204 actually contains PL12321 content'
        },
        # NOTE: PL11089 real content not found in Alfresco - file missing or labeled differently
        # Currently PL11089 will show PL689 content (unavoidable until real file is found)
    }
    
    
    def resolve_store_urls_by_document_number(self, document_number: str) -> List[Dict]:
        """
        Resolve one or more store:// URLs for a given document_number via lr_source_document
        and group by document_id (each with its own content_url set).
        
        FIXED: Now uses exact string matching with trimming to prevent substring matches.
        WORKAROUND: Applies association redirect for mislabeled nodes.
        """
        results: List[Dict] = []
        
        # CRITICAL FIX: Trim whitespace from input to ensure exact matching
        document_number = document_number.strip()
        
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # Step 1: Get all document IDs linked to the document number
            db_type = self.db_config.get("type", "mssql").lower()
            if db_type == "mysql":
                # MySQL uses TRIM() instead of RTRIM(LTRIM())
                cursor.execute("""
                    SELECT id AS document_id, document_type, page_count, issued_by, create_date
                    FROM lr_source_document
                    WHERE TRIM(document_number) = %s
                """, (document_number,))
            else:
                # MSSQL uses RTRIM(LTRIM()) and schema prefix
                cursor.execute("""
                    SELECT id AS document_id, document_type, page_count, issued_by, create_date
                    FROM LRSAdmin.lr_source_document
                    WHERE RTRIM(LTRIM(document_number)) = ?
                """, (document_number,))
            
            docs = cursor.fetchall()
            
            if not docs:
                print(f"⚠️  No documents found for document_number: '{document_number}'")
                cursor.close()
                conn.close()
                return []
            
            print(f"✅ Found {len(docs)} document(s) for '{document_number}':")
            for doc in docs:
                print(f"   - Document ID: {doc[0]}, Type: {doc[1]}, Pages: {doc[2]}")

            # Step 2: Get the database reference (usually just 1)
            if db_type == "mysql":
                # MySQL uses TRIM() and no schema prefix
                cursor.execute("""
                    SELECT DISTINCT
                        n.id AS node_id,
                        n.uuid,
                        cu.content_url,
                        cu.content_size,
                        mt.mimetype_str
                    FROM alf_node_properties np
                    JOIN alf_qname q ON q.id = np.qname_id
                    JOIN alf_node n ON n.id = np.node_id AND n.node_deleted = 0
                    LEFT JOIN alf_content_data cd ON cd.id = n.id
                    LEFT JOIN alf_content_url cu ON cu.id = cd.content_url_id
                    LEFT JOIN alf_mimetype mt ON mt.id = cd.content_mimetype_id
                    WHERE TRIM(np.string_value) = %s
                    AND q.local_name IN ('targetRids','sourceRids')
                    ORDER BY n.id
                """, (document_number,))
            else:
                # MSSQL uses RTRIM(LTRIM()) and schema prefix
                cursor.execute("""
                    SELECT DISTINCT
                        n.id AS node_id,
                        n.uuid,
                        cu.content_url,
                        cu.content_size,
                        mt.mimetype_str
                    FROM LRSAdmin.alf_node_properties np
                    JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id
                    JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0
                    LEFT JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id
                    LEFT JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id
                    LEFT JOIN LRSAdmin.alf_mimetype mt ON mt.id = cd.content_mimetype_id
                    WHERE RTRIM(LTRIM(np.string_value)) = ?
                    AND q.local_name IN ('targetRids','sourceRids')
                    ORDER BY n.id
                """, (document_number,))
            
            cols = [d[0] for d in cursor.description]
            db_images = [dict(zip(cols, row)) for row in cursor.fetchall()]

            cursor.close()
            conn.close()
            
            print(f"📊 Database returned {len(db_images)} reference(s) for '{document_number}'")
            
            # WORKAROUND: Replace wrong URLs with correct URLs if we have a mapping
            mapping_applied = False
            if document_number in self.CORRECT_FILE_MAPPING:
                mapping = self.CORRECT_FILE_MAPPING[document_number]
                correct_url = mapping['correct_url']
                reason = mapping['reason']
                
                print(f"🔄 APPLYING FILE URL FIX for {document_number}")
                print(f"   Reason: {reason}")
                print(f"   Using correct file: {correct_url}")
                
                # Replace ALL images with the correct file
                db_images = [{
                    'node_id': None,
                    'uuid': None,
                    'content_url': correct_url,
                    'content_size': None,
                    'mimetype_str': None
                }]
                mapping_applied = True
                print(f"   ✅ Replaced with correct file URL")
                print(f"   🔒 Filesystem discovery will use this corrected file as reference")

            # Step 3: For each document ID, determine actual images
            for doc_id, doc_type, page_count, issued_by, create_date in docs:
                actual_images = db_images.copy()
                
                # Use filesystem discovery if page_count > actual images
                # When mapping is applied, the URL is now CORRECT, so discovery should work properly
                if page_count and page_count > len(db_images) and len(db_images) > 0:
                    if mapping_applied:
                        print(f"🔍 Document ID {doc_id}: Expected {page_count} pages, found {len(db_images)} in DB")
                        print(f"   Using filesystem discovery with CORRECTED base file...")
                        print(f"   Base file: {db_images[0]['content_url']}")
                    else:
                        print(f"🔍 Document ID {doc_id}: Expected {page_count} pages, found {len(db_images)} in DB")
                        print(f"   Using filesystem-based discovery...")
                    
                    discovered_urls = self._discover_pages_by_filesystem(
                        db_images[0]['content_url'],
                        page_count
                    )
                    
                    if len(discovered_urls) > len(db_images):
                        actual_images = []
                        for url in discovered_urls:
                            actual_images.append({
                                'node_id': None,
                                'uuid': None,
                                'content_url': url,
                                'content_size': None,
                                'mimetype_str': None
                            })
                        print(f"   ✅ Filesystem discovery found {len(actual_images)} pages")
                
                results.append({
                    "document_id": doc_id,
                    "document_type": doc_type,
                    "page_count": page_count,
                    "issued_by": issued_by,
                    "create_date": create_date.isoformat() if create_date else None,
                    "images": actual_images,
                })

            print(f"✅ Total: {len(results)} document ID(s) processed")

        except Exception as e:
            print(f"❌ resolve_store_urls_by_document_number error: {e}")
            import traceback
            traceback.print_exc()

        return results
    def _discover_pages_by_filesystem(self, reference_url: str, expected_page_count: int) -> List[str]:
        """
        Discover pages by filesystem when database only has 1 reference.
        Uses sophisticated timestamp clustering + proximity algorithm.
        
        This is the PRODUCTION-READY algorithm based on successful testing with PL11089.
        
        Strategy:
        1. Detect if directory contains multiple documents (ratio > 2x)
        2. If multi-document: Use timestamp clustering to find document boundary
        3. Refine selection using proximity to reference file
        4. Sort by timestamp to preserve page order
        
        Args:
            reference_url: The single store:// URL from database
            expected_page_count: Number of pages expected
        
        Returns:
            List of store:// URLs for all pages in correct order
        """
        try:
            # Extract directory from reference URL
            rel_path = reference_url.replace("store://", "")
            full_path = os.path.join(self.contentstore_base, rel_path.replace("/", os.sep))
            full_path = os.path.normpath(full_path)
            
            directory = os.path.dirname(full_path)
            reference_file = os.path.basename(full_path)
            
            if not os.path.exists(full_path):
                print(f"⚠️  Reference file not found: {full_path}")
                return [reference_url]
            
            if not os.path.exists(directory):
                print(f"⚠️  Directory not found: {directory}")
                return [reference_url]
            
            # Get reference file timestamp
            ref_stat = os.stat(full_path)
            ref_mtime = ref_stat.st_mtime
            
            # List all .bin files with timestamps
            all_files = []
            for filename in os.listdir(directory):
                if filename.endswith('.bin'):
                    filepath = os.path.join(directory, filename)
                    try:
                        stat = os.stat(filepath)
                        all_files.append({
                            'filename': filename,
                            'mtime': stat.st_mtime,
                            'filepath': filepath
                        })
                    except:
                        continue
            
            print(f"📊 Directory analysis:")
            print(f"   Total files: {len(all_files)}")
            print(f"   Expected pages: {expected_page_count}")
            print(f"   Ratio: {len(all_files) / expected_page_count:.2f}x")
            
            # STRATEGY SELECTION: Multi-document detection
            if len(all_files) > expected_page_count * 2:
                print(f"⚠️  Multi-document directory detected!")
                print(f"   Using timestamp clustering algorithm...")
                selected_files = self._cluster_and_select_files(
                    all_files, reference_file, ref_mtime, expected_page_count
                )
            else:
                print(f"✅ Single-document directory")
                print(f"   Using proximity algorithm...")
                selected_files = self._proximity_select_files(
                    all_files, ref_mtime, expected_page_count
                )
            
            # Convert to store URLs and sort by timestamp (preserves page order)
            dir_parts = rel_path.rsplit('/', 1)[0]
            discovered_urls = []
            for f in selected_files:
                discovered_urls.append(f"store://{dir_parts}/{f['filename']}")
            
            print(f"✅ Selected {len(discovered_urls)} files")
            
            return discovered_urls
            
        except Exception as e:
            print(f"⚠️  Filesystem discovery error: {e}")
            import traceback
            traceback.print_exc()
            return [reference_url]
    
    def _cluster_and_select_files(self, all_files: List[Dict], reference_filename: str, 
                                  ref_mtime: float, expected_count: int) -> List[Dict]:
        """
        Timestamp clustering algorithm for multi-document directories.
        
        Theory: Files uploaded together have similar timestamps with small gaps.
        Different documents have larger time gaps between them.
        
        Args:
            all_files: List of file dicts with filename, mtime, filepath
            reference_filename: Name of reference file
            ref_mtime: Modification time of reference file
            expected_count: Number of files to select
        
        Returns:
            List of selected files sorted by timestamp
        """
        # Sort by modification time
        sorted_files = sorted(all_files, key=lambda x: x['mtime'])
        
        # Find reference file index
        ref_idx = None
        for i, f in enumerate(sorted_files):
            if f['filename'] == reference_filename:
                ref_idx = i
                break
        
        if ref_idx is None:
            print(f"   ⚠️  Reference file not in sorted list, falling back to proximity")
            return self._proximity_select_files(all_files, ref_mtime, expected_count)
        
        print(f"   📍 Reference at position {ref_idx + 1}/{len(sorted_files)}")
        
        # Calculate time gaps between consecutive files
        time_gaps = []
        for i in range(len(sorted_files) - 1):
            gap = sorted_files[i + 1]['mtime'] - sorted_files[i]['mtime']
            time_gaps.append(gap)
        
        if not time_gaps:
            return [sorted_files[ref_idx]]
        
        # Find cluster threshold (gaps > median * 5, minimum 60 seconds)
        import statistics
        median_gap = statistics.median(time_gaps)
        threshold = max(median_gap * 5, 60.0)
        
        print(f"   🔍 Gap analysis: median={median_gap:.2f}s, threshold={threshold:.2f}s")
        
        # Find cluster boundaries (where gaps exceed threshold)
        cluster_starts = [0]
        for i, gap in enumerate(time_gaps):
            if gap > threshold:
                cluster_starts.append(i + 1)
        cluster_starts.append(len(sorted_files))
        
        print(f"   📊 Found {len(cluster_starts) - 1} clusters")
        
        # Find cluster containing reference file
        for i in range(len(cluster_starts) - 1):
            start = cluster_starts[i]
            end = cluster_starts[i + 1]
            
            if start <= ref_idx < end:
                cluster_files = sorted_files[start:end]
                print(f"   ✅ Reference in cluster {i + 1} ({len(cluster_files)} files)")
                
                # If cluster is larger than expected, refine using proximity
                if len(cluster_files) > expected_count:
                    print(f"   🔧 Cluster too large, refining with proximity...")
                    return self._proximity_select_files(cluster_files, ref_mtime, expected_count)
                
                return cluster_files
        
        # Fallback
        print(f"   ⚠️  Cluster detection failed, using proximity")
        return self._proximity_select_files(all_files, ref_mtime, expected_count)
    
    def _proximity_select_files(self, candidates: List[Dict], ref_mtime: float, 
                                expected_count: int) -> List[Dict]:
        """
        Proximity refinement algorithm - selects N files closest to reference by time.
        
        Args:
            candidates: List of candidate files
            ref_mtime: Reference file modification time
            expected_count: Number of files to select
        
        Returns:
            List of selected files sorted by timestamp
        """
        # Calculate time difference for each candidate
        for f in candidates:
            f['time_diff'] = abs(f['mtime'] - ref_mtime)
        
        # Sort by proximity and take N closest
        sorted_by_proximity = sorted(candidates, key=lambda x: x['time_diff'])
        selected = sorted_by_proximity[:expected_count]
        
        # Sort selected files by timestamp (preserves page order)
        selected = sorted(selected, key=lambda x: x['mtime'])
        
        if selected:
            max_diff = max(f['time_diff'] for f in selected)
            print(f"   📏 Max time deviation: {max_diff:.2f}s")
        
        return selected

    # def fetch_pdf_by_document_number(self, document_number: str) -> Dict:
    #     """Resolve store URL(s) by document_number and convert first match to PDF."""
    #     out: Dict = {"document_number": document_number, "pdf_path": None, "store_url": None, "file_path": None, "error": None}
    #     matches = self.resolve_store_urls_by_document_number(document_number)
    #     if not matches:
    #         out["error"] = "No content found for document_number"
    #         return out
    #     store_url = matches[0].get("content_url")
    #     out["store_url"] = store_url
    #     path = self.parse_store_url_to_path(store_url)
    #     if not path:
    #         out["error"] = "Invalid store_url"
    #         return out
    #     out["file_path"] = path
    #     if not os.path.exists(path):
    #         out["error"] = f"File not found: {path}"
    #         return out
    #     if not self._is_jpeg_file(path):
    #         out["error"] = "Not a JPEG/LEADTOOLS .bin"
    #         return out
    #     pdf_path = path.replace(".bin", ".pdf")
    #     converted = self.convert_bin_to_pdf(path, pdf_path)
    #     if converted and os.path.exists(converted):
    #         out["pdf_path"] = converted
    #     else:
    #         out["error"] = "Conversion failed"
    #     return out
    def fetch_pdf_by_document_number(self, document_number: str, document_id: Optional[int] = None) -> Dict:
        """
        Resolve store URL(s) by document_number and convert to PDF.
        
        FIXED: Now properly handles multiple document IDs per document number.
        If document_id is specified, uses that specific document.
        Otherwise, uses the first document ID found.
        
        Args:
            document_number: Document number to search for
            document_id: Optional specific document ID to use
        
        Returns:
            Dict with pdf_path, document_id, page_count, and error info
        """
        result = {
            "document_number": document_number,
            "document_id": None,
            "pdf_path": None,
            "page_count": 0,
            "error": None
        }
        
        print(f"\n{'='*60}")
        print(f"FETCH PDF BY DOCUMENT NUMBER: {document_number}")
        if document_id:
            print(f"   Specific Document ID: {document_id}")
        print(f"{'='*60}")
        
        # Step 1: Resolve store URLs from database (grouped by document_id)
        print(f"🔍 Querying database for document {document_number}...")
        doc_groups = self.resolve_store_urls_by_document_number(document_number)
        
        if not doc_groups:
            result["error"] = f"No content found for document_number: {document_number}"
            print(f"❌ {result['error']}")
            return result
        
        print(f"✅ Found {len(doc_groups)} document ID(s)")
        
        # Step 2: Select which document ID to use
        selected_doc = None
        if document_id:
            # Find specific document ID
            for doc in doc_groups:
                if doc["document_id"] == document_id:
                    selected_doc = doc
                    break
            
            if not selected_doc:
                result["error"] = f"Document ID {document_id} not found for document number {document_number}"
                print(f"❌ {result['error']}")
                return result
        else:
            # Use first document ID
            selected_doc = doc_groups[0]
            print(f"⚠️  Multiple document IDs found, using first one: {selected_doc['document_id']}")
        
        result["document_id"] = selected_doc["document_id"]
        result["page_count"] = selected_doc["page_count"]
        
        print(f"📋 Using Document ID: {selected_doc['document_id']}")
        print(f"   Document Type: {selected_doc['document_type']}")
        print(f"   Page Count: {selected_doc['page_count']}")
        print(f"   Images Available: {len(selected_doc['images'])}")
        
        # Step 3: Generate PDF for this specific document ID
        # Delegate to generate_pdf_for_document with specific document_id
        return self.generate_pdf_for_document(
            document_number=document_number,
            document_id=selected_doc["document_id"]
        )

    def resolve_store_url_by_uuid(self, uuid: str) -> Dict:
        """Resolve store:// URL by Alfresco UUID (native Aumentum method).
        
        This mimics how Aumentum Web Access retrieves documents via:
        /documentPage.do?uuid=<uuid>
        """
        result: Dict = {}
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()
            
            db_type = self.db_config.get("type", "mssql").lower()
            if db_type == "mysql":
                query = (
                    "SELECT n.id, n.uuid, cu.content_url, cu.content_size, mt.mimetype_str "
                    "FROM alf_node n "
                    "JOIN alf_content_data cd ON cd.id = n.id "
                    "JOIN alf_content_url cu ON cu.id = cd.content_url_id "
                    "LEFT JOIN alf_mimetype mt ON mt.id = cd.content_mimetype_id "
                    "WHERE n.uuid = %s AND n.node_deleted = 0"
                )
                cursor.execute(query, (uuid,))
            else:
                query = (
                    "SELECT n.id, n.uuid, cu.content_url, cu.content_size, mt.mimetype_str "
                    "FROM LRSAdmin.alf_node n "
                    "JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id "
                    "JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id "
                    "LEFT JOIN LRSAdmin.alf_mimetype mt ON mt.id = cd.content_mimetype_id "
                    "WHERE n.uuid = ? AND n.node_deleted = 0"
                )
                cursor.execute(query, (uuid,))
            cols = [d[0] for d in cursor.description]
            row = cursor.fetchone()
            if row:
                result = dict(zip(cols, row))
            cursor.close()
            conn.close()
        except Exception as e:
            print(f"resolve_store_url_by_uuid error: {e}")
        return result

    def fetch_pdf_by_uuid(self, uuid: str) -> Dict:
        """Fetch and convert PDF by Alfresco UUID (native Aumentum method)."""
        out: Dict = {"uuid": uuid, "pdf_path": None, "store_url": None, "file_path": None, "error": None}
        match = self.resolve_store_url_by_uuid(uuid)
        if not match:
            out["error"] = "No content found for UUID"
            return out
        store_url = match.get("content_url")
        out["store_url"] = store_url
        path = self.parse_store_url_to_path(store_url)
        if not path:
            out["error"] = "Invalid store_url"
            return out
        out["file_path"] = path
        if not os.path.exists(path):
            out["error"] = f"File not found: {path}"
            return out
        if not self._is_jpeg_file(path):
            out["error"] = "Not a JPEG/LEADTOOLS .bin"
            return out
        pdf_path = path.replace(".bin", ".pdf")
        converted = self.convert_bin_to_pdf(path, pdf_path)
        if converted and os.path.exists(converted):
            out["pdf_path"] = converted
        else:
            out["error"] = "Conversion failed"
        return out
    
    def discover_database_schema(self) -> Dict:
        """
        Discover Aumentum database schema to understand property/document relationships.
        
        Returns:
            Dictionary with schema information
        """
        schema_info = {
            "tables": [],
            "property_tables": [],
            "document_tables": [],
            "errors": []
        }
        
        try:
            conn = self._get_db_connection()
            cursor = conn.cursor()

            # SQL Server: list tables
            cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE'")
            tables = [row[0] for row in cursor.fetchall()]
            schema_info["tables"] = tables

            # Try to get doc_storage schema
            if "doc_storage" in tables:
                cursor.execute("SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'doc_storage'")
                schema_info["doc_storage_schema"] = cursor.fetchall()

            # Categorize tables
            property_keywords = ["property", "parcel", "lot", "owner", "assessment", "apn"]
            for table in schema_info["tables"]:
                table_lower = table.lower()
                if any(keyword in table_lower for keyword in property_keywords):
                    schema_info["property_tables"].append(table)

            doc_keywords = ["doc", "file", "image", "storage", "content"]
            for table in schema_info["tables"]:
                table_lower = table.lower()
                if any(keyword in table_lower for keyword in doc_keywords):
                    schema_info["document_tables"].append(table)

            cursor.close()
            conn.close()

        except Exception as e:
            schema_info["errors"].append(str(e))
        
        return schema_info
    
    # def generate_pdf_for_document(self, document_number: str, cache_dir: str = None) -> dict:
    #     """
    #     Generate multi-page PDF for a document by combining all pages.
        
    #     FIXED: Properly handles path separators and multi-page documents.
        
    #     Args:
    #         document_number: Document number from lr_source_document
    #         cache_dir: Directory for cached PDFs
        
    #     Returns:
    #         dict with success, pdf_path, expected_pages, actual_pages, errors
    #     """
    #     result = {
    #         "document_number": document_number,
    #         "success": False,
    #         "pdf_path": None,
    #         "expected_pages": 0,
    #         "actual_pages": 0,
    #         "errors": []
    #     }
        
    #     print(f"\n{'='*80}")
    #     print(f"GENERATE PDF FOR DOCUMENT: {document_number}")
    #     print(f"{'='*80}")
        
    #     try:
    #         # Setup cache directory
    #         cache_dir = cache_dir or os.getenv("TEMP_PDF_DIR", "./temp_pdfs")
    #         os.makedirs(cache_dir, exist_ok=True)
    #         cached_pdf = os.path.join(cache_dir, f"{document_number}.pdf")
    #         cached_pdf = os.path.normpath(cached_pdf)  # Normalize cache path too!
            
    #         print(f"📁 Cache directory: {cache_dir}")
    #         print(f"📄 Target PDF: {cached_pdf}")
            
    #         # Check if PDF already exists in cache
    #         if os.path.exists(cached_pdf):
    #             print(f"✅ Using cached PDF: {cached_pdf}")
    #             result["success"] = True
    #             result["pdf_path"] = cached_pdf
                
    #             # Try to get page count from existing PDF
    #             try:
    #                 from PyPDF2 import PdfReader
    #                 reader = PdfReader(cached_pdf)
    #                 result["actual_pages"] = len(reader.pages)
    #                 print(f"📑 Cached PDF has {result['actual_pages']} pages")
    #             except:
    #                 result["actual_pages"] = 1
                
    #             return result
            
    #         # Step 1: Get document metadata from database
    #         print(f"\n1️⃣ Fetching document metadata from database...")
    #         conn = self._get_db_connection()
    #         cursor = conn.cursor()
            
    #         cursor.execute(
    #             "SELECT page_count FROM LRSAdmin.lr_source_document WHERE document_number = ?",
    #             (document_number,)
    #         )
    #         row = cursor.fetchone()
            
    #         if not row:
    #             result["errors"].append(f"Document not found: {document_number}")
    #             cursor.close()
    #             conn.close()
    #             return result
            
    #         page_count = row[0] or 1
    #         result["expected_pages"] = page_count
    #         print(f"   ✅ Found document with {page_count} page(s)")
            
    #         cursor.close()
    #         conn.close()
            
    #         # Step 2: Resolve store URLs for all pages
    #         print(f"\n2️⃣ Resolving content URLs...")
    #         matches = self.resolve_store_urls_by_document_number(document_number)
            
    #         if not matches:
    #             result["errors"].append("No content found in Alfresco")
    #             return result
            
    #         print(f"   ✅ Found {len(matches)} content URL(s)")
            
    #         # Step 3: Convert each .bin page to PDF
    #         print(f"\n3️⃣ Converting pages to PDF...")
    #         page_pdfs = []
            
    #         for i, match in enumerate(matches, 1):
    #             store_url = match.get("content_url")
    #             print(f"\n   Page {i}/{len(matches)}: {store_url}")
                
    #             # CRITICAL: Use fixed parse_store_url_to_path
    #             file_path = self.parse_store_url_to_path(store_url)
                
    #             if not file_path:
    #                 error = f"Page {i}: Invalid store URL"
    #                 print(f"   ❌ {error}")
    #                 result["errors"].append(error)
    #                 continue
                
    #             print(f"      File: {file_path}")
                
    #             # Check if file exists
    #             if not os.path.exists(file_path):
    #                 error = f"Page {i}: File not found: {file_path}"
    #                 print(f"   ❌ {error}")
    #                 result["errors"].append(error)
                    
    #                 # Debug: Check parent directory
    #                 parent = os.path.dirname(file_path)
    #                 if os.path.exists(parent):
    #                     print(f"      📂 Parent exists, listing files...")
    #                     try:
    #                         files = os.listdir(parent)
    #                         print(f"         Found {len(files)} files")
    #                         if files:
    #                             print(f"         Sample: {files[:3]}")
    #                     except Exception as e:
    #                         print(f"         Error listing: {e}")
    #                 else:
    #                     print(f"      📂 Parent missing: {parent}")
                    
    #                 continue
                
    #             print(f"      ✅ File exists ({os.path.getsize(file_path):,} bytes)")
                
    #             # Check if it's a JPEG
    #             if not self._is_jpeg_file(file_path):
    #                 error = f"Page {i}: Not a JPEG file"
    #                 print(f"   ❌ {error}")
    #                 result["errors"].append(error)
    #                 continue
                
    #             print(f"      ✅ Valid JPEG")
                
    #             # Convert to temporary PDF
    #             temp_pdf = file_path.replace(".bin", f"_page{i}.pdf")
    #             temp_pdf = os.path.normpath(temp_pdf)  # Normalize!
                
    #             print(f"      🔄 Converting to: {temp_pdf}")
    #             converted = self.convert_bin_to_pdf(file_path, temp_pdf)
                
    #             if converted and os.path.exists(converted):
    #                 page_pdfs.append(converted)
    #                 print(f"      ✅ Converted ({os.path.getsize(converted):,} bytes)")
    #             else:
    #                 error = f"Page {i}: Conversion failed"
    #                 print(f"      ❌ {error}")
    #                 result["errors"].append(error)
            
    #         # Step 4: Merge PDFs if multiple pages
    #         if not page_pdfs:
    #             result["errors"].append("No pages were successfully converted")
    #             return result
            
    #         result["actual_pages"] = len(page_pdfs)
    #         print(f"\n4️⃣ Merging {len(page_pdfs)} page(s)...")
            
    #         if len(page_pdfs) == 1:
    #             # Single page - just copy
    #             import shutil
    #             shutil.copy2(page_pdfs[0], cached_pdf)
    #             print(f"   ✅ Single page copied to cache")
    #         else:
    #             # Multiple pages - merge
    #             try:
    #                 from PyPDF2 import PdfMerger
                    
    #                 merger = PdfMerger()
    #                 for pdf in page_pdfs:
    #                     merger.append(pdf)
                    
    #                 merger.write(cached_pdf)
    #                 merger.close()
    #                 print(f"   ✅ Merged {len(page_pdfs)} pages")
    #             except Exception as e:
    #                 error = f"PDF merge failed: {e}"
    #                 print(f"   ❌ {error}")
    #                 result["errors"].append(error)
    #                 return result
            
    #         # Cleanup temporary PDFs
    #         print(f"\n5️⃣ Cleaning up temporary files...")
    #         for temp_pdf in page_pdfs:
    #             try:
    #                 if temp_pdf != cached_pdf:
    #                     os.remove(temp_pdf)
    #                     print(f"   🗑️  Removed: {os.path.basename(temp_pdf)}")
    #             except Exception as e:
    #                 print(f"   ⚠️  Could not remove {temp_pdf}: {e}")
            
    #         # Final verification
    #         if os.path.exists(cached_pdf):
    #             result["success"] = True
    #             result["pdf_path"] = cached_pdf
    #             print(f"\n✅ SUCCESS: PDF generated at {cached_pdf}")
    #             print(f"   Size: {os.path.getsize(cached_pdf):,} bytes")
    #             print(f"   Pages: {result['actual_pages']}/{result['expected_pages']}")
    #         else:
    #             result["errors"].append("PDF file not found after generation")
            
    #         return result
            
    #     except Exception as e:
    #         error = f"Unexpected error: {str(e)}"
    #         print(f"\n❌ {error}")
    #         result["errors"].append(error)
    #         import traceback
    #         traceback.print_exc()
    #         return result
    # def generate_pdf_for_document(self, document_number: str, document_id: Optional[int] = None, cache_dir: str = None) -> dict:
    #     """
    #     Generate multi-page PDF for a document by combining all pages.
        
    #     FIXED: Now properly handles multiple document IDs per document number.
    #     Each document ID gets its own cached PDF file.
        
    #     Args:
    #         document_number: Document number from lr_source_document
    #         document_id: Optional specific document ID (if None, uses first one found)
    #         cache_dir: Directory for cached PDFs (defaults to /tmp/aumentum_pdfs)
        
    #     Returns:
    #         dict with success, pdf_path, document_id, expected_pages, actual_pages, errors
    #     """
    #     result = {
    #         "document_number": document_number,
    #         "document_id": document_id,
    #         "success": False,
    #         "pdf_path": None,
    #         "expected_pages": 0,
    #         "actual_pages": 0,
    #         "errors": []
    #     }
        
    #     print(f"\n{'='*80}")
    #     print(f"GENERATE PDF FOR DOCUMENT: {document_number}")
    #     if document_id:
    #         print(f"   Document ID: {document_id}")
    #     print(f"{'='*80}")
        
    #     try:
    #         # Setup cache directory - use temp directory by default
    #         cache_dir = cache_dir or os.getenv("TEMP_PDF_DIR", "/tmp/aumentum_pdfs")
    #         os.makedirs(cache_dir, exist_ok=True)
            
    #         # Step 1: Resolve store URLs grouped by document_id
    #         print(f"\n1️⃣ Resolving content URLs from database...")
    #         doc_groups = self.resolve_store_urls_by_document_number(document_number)
            
    #         if not doc_groups:
    #             result["errors"].append(f"No content found for document number: {document_number}")
    #             return result
            
    #         print(f"   ✅ Found {len(doc_groups)} document ID(s)")
            
    #         # Step 2: Select which document ID to process
    #         selected_doc = None
    #         if document_id:
    #             # Find specific document ID
    #             for doc in doc_groups:
    #                 if doc["document_id"] == document_id:
    #                     selected_doc = doc
    #                     break
                
    #             if not selected_doc:
    #                 result["errors"].append(f"Document ID {document_id} not found for document number {document_number}")
    #                 return result
    #         else:
    #             # Use first document ID
    #             selected_doc = doc_groups[0]
    #             if len(doc_groups) > 1:
    #                 print(f"   ⚠️  Multiple document IDs found, using first one: {selected_doc['document_id']}")
            
    #         # Update result with selected document info
    #         result["document_id"] = selected_doc["document_id"]
    #         result["expected_pages"] = selected_doc["page_count"] or 1
            
    #         print(f"\n2️⃣ Processing Document ID: {selected_doc['document_id']}")
    #         print(f"   Document Type: {selected_doc['document_type']}")
    #         print(f"   Expected Pages: {selected_doc['page_count']}")
    #         print(f"   Available Images: {len(selected_doc['images'])}")
            
    #         # Create unique cache filename using document_id to avoid conflicts
    #         cached_pdf = os.path.join(cache_dir, f"{document_number}_doc{selected_doc['document_id']}.pdf")
    #         cached_pdf = os.path.normpath(cached_pdf)
            
    #         print(f"📄 Target PDF: {cached_pdf}")
            
    #         # Check if PDF already exists in cache
    #         if os.path.exists(cached_pdf):
    #             print(f"✅ Using cached PDF: {cached_pdf}")
    #             result["success"] = True
    #             result["pdf_path"] = cached_pdf
                
    #             # Try to get page count from existing PDF
    #             try:
    #                 from PyPDF2 import PdfReader
    #                 reader = PdfReader(cached_pdf)
    #                 result["actual_pages"] = len(reader.pages)
    #                 print(f"📑 Cached PDF has {result['actual_pages']} pages")
    #             except:
    #                 result["actual_pages"] = 1
                
    #             return result
            
    #         # Step 3: Convert each .bin page to PDF
    #         print(f"\n3️⃣ Converting {len(selected_doc['images'])} page(s) to PDF...")
    #         page_pdfs = []
            
    #         for i, image_info in enumerate(selected_doc['images'], 1):
    #             store_url = image_info.get("content_url")
    #             print(f"\n   Page {i}/{len(selected_doc['images'])}: {store_url}")
                
    #             # Parse store URL to file path
    #             file_path = self.parse_store_url_to_path(store_url)
                
    #             if not file_path:
    #                 error = f"Page {i}: Invalid store URL"
    #                 print(f"      ❌ {error}")
    #                 result["errors"].append(error)
    #                 continue
                
    #             print(f"      File: {file_path}")
                
    #             # Check if file exists
    #             if not os.path.exists(file_path):
    #                 error = f"Page {i}: File not found: {file_path}"
    #                 print(f"      ❌ {error}")
    #                 result["errors"].append(error)
                    
    #                 # Debug: Check parent directory
    #                 parent = os.path.dirname(file_path)
    #                 if os.path.exists(parent):
    #                     print(f"      📂 Parent exists, listing files...")
    #                     try:
    #                         files = os.listdir(parent)
    #                         print(f"         Found {len(files)} files")
    #                         if files:
    #                             print(f"         Sample: {files[:3]}")
    #                     except Exception as e:
    #                         print(f"         Error listing: {e}")
    #                 else:
    #                     print(f"      📂 Parent missing: {parent}")
                    
    #                 continue
                
    #             print(f"      ✅ File exists ({os.path.getsize(file_path):,} bytes)")
                
    #             # Check if it's a JPEG
    #             if not self._is_jpeg_file(file_path):
    #                 error = f"Page {i}: Not a JPEG file"
    #                 print(f"      ❌ {error}")
    #                 result["errors"].append(error)
    #                 continue
                
    #             print(f"      ✅ Valid JPEG")
                
    #             # Convert to temporary PDF in cache directory (NOT contentstore!)
    #             temp_filename = f"{document_number}_doc{selected_doc['document_id']}_page{i}.pdf"
    #             temp_pdf = os.path.join(cache_dir, temp_filename)
    #             temp_pdf = os.path.normpath(temp_pdf)
                
    #             print(f"      🔄 Converting to: {temp_pdf}")
    #             converted = self.convert_bin_to_pdf(file_path, temp_pdf)
                
    #             if converted and os.path.exists(converted):
    #                 page_pdfs.append(converted)
    #                 print(f"      ✅ Converted ({os.path.getsize(converted):,} bytes)")
    #             else:
    #                 error = f"Page {i}: Conversion failed"
    #                 print(f"      ❌ {error}")
    #                 result["errors"].append(error)
            
    #         # Step 4: Merge PDFs if multiple pages
    #         if not page_pdfs:
    #             result["errors"].append("No pages were successfully converted")
    #             return result
            
    #         result["actual_pages"] = len(page_pdfs)
    #         print(f"\n4️⃣ Merging {len(page_pdfs)} page(s)...")
            
    #         if len(page_pdfs) == 1:
    #             # Single page - just copy
    #             shutil.copy2(page_pdfs[0], cached_pdf)
    #             print(f"   ✅ Single page copied to cache")
    #         else:
    #             # Multiple pages - merge
    #             try:
    #                 from PyPDF2 import PdfMerger
                    
    #                 merger = PdfMerger()
    #                 for pdf in page_pdfs:
    #                     merger.append(pdf)
                    
    #                 merger.write(cached_pdf)
    #                 merger.close()
    #                 print(f"   ✅ Merged {len(page_pdfs)} pages")
    #             except Exception as e:
    #                 error = f"PDF merge failed: {e}"
    #                 print(f"   ❌ {error}")
    #                 result["errors"].append(error)
    #                 return result
            
    #         # Cleanup temporary PDFs
    #         print(f"\n5️⃣ Cleaning up temporary files...")
    #         for temp_pdf in page_pdfs:
    #             try:
    #                 if temp_pdf != cached_pdf:
    #                     os.remove(temp_pdf)
    #                     print(f"   🗑️  Removed: {os.path.basename(temp_pdf)}")
    #             except Exception as e:
    #                 print(f"   ⚠️  Could not remove {temp_pdf}: {e}")
            
    #         # Final verification
    #         if os.path.exists(cached_pdf):
    #             result["success"] = True
    #             result["pdf_path"] = cached_pdf
    #             print(f"\n✅ SUCCESS: PDF generated at {cached_pdf}")
    #             print(f"   Document ID: {result['document_id']}")
    #             print(f"   Size: {os.path.getsize(cached_pdf):,} bytes")
    #             print(f"   Pages: {result['actual_pages']}/{result['expected_pages']}")
    #         else:
    #             result["errors"].append("PDF file not found after generation")
            
    #         return result
            
    #     except Exception as e:
    #         error = f"Unexpected error: {str(e)}"
    #         print(f"\n❌ {error}")
    #         result["errors"].append(error)
    #         import traceback
    #         traceback.print_exc()
    #         return result
    def generate_pdf_for_document(self, document_number: str, document_id: Optional[int] = None, cache_dir: str = None) -> dict:
        """
        Generate multi-page PDF for a document by combining all pages.
        
        FIXED: Now REQUIRES document_id to prevent mixing up different documents with same number.
        
        Args:
            document_number: Document number from lr_source_document
            document_id: REQUIRED specific document ID (prevents mixing PL689 with PL11089)
            cache_dir: Directory for cached PDFs (defaults to /tmp/aumentum_pdfs)
        
        Returns:
            dict with success, pdf_path, document_id, expected_pages, actual_pages, errors
        """
        result = {
            "document_number": document_number,
            "document_id": document_id,
            "success": False,
            "pdf_path": None,
            "expected_pages": 0,
            "actual_pages": 0,
            "errors": []
        }
        
        print(f"\n{'='*80}")
        print(f"GENERATE PDF FOR DOCUMENT: {document_number}")
        if document_id:
            print(f"   Document ID: {document_id}")
        print(f"{'='*80}")
        
        try:
            # Setup cache directory - use temp directory by default
            cache_dir = cache_dir or os.getenv("TEMP_PDF_DIR", "/tmp/aumentum_pdfs")
            os.makedirs(cache_dir, exist_ok=True)
            
            # Step 1: Resolve store URLs grouped by document_id
            print(f"\n1️⃣ Resolving content URLs from database...")
            doc_groups = self.resolve_store_urls_by_document_number(document_number)
            
            if not doc_groups:
                result["errors"].append(f"No content found for document number: {document_number}")
                return result
            
            print(f"   ✅ Found {len(doc_groups)} document ID(s)")
            
            # CRITICAL FIX: Enforce document_id specification if multiple exist
            if len(doc_groups) > 1 and document_id is None:
                result["errors"].append(
                    f"Multiple document IDs found for {document_number}. "
                    f"Please specify document_id. Available IDs: "
                    f"{[doc['document_id'] for doc in doc_groups]}"
                )
                print(f"❌ {result['errors'][-1]}")
                return result
            
            # Step 2: Select which document ID to process
            selected_doc = None
            if document_id:
                # Find specific document ID
                for doc in doc_groups:
                    if doc["document_id"] == document_id:
                        selected_doc = doc
                        break
                
                if not selected_doc:
                    result["errors"].append(
                        f"Document ID {document_id} not found for document number {document_number}. "
                        f"Available IDs: {[doc['document_id'] for doc in doc_groups]}"
                    )
                    print(f"❌ {result['errors'][-1]}")
                    return result
            else:
                # Use first document ID (only if there's exactly one)
                selected_doc = doc_groups[0]
                print(f"   ℹ️  Only one document ID found, using: {selected_doc['document_id']}")
            
            # Update result with selected document info
            result["document_id"] = selected_doc["document_id"]
            result["expected_pages"] = selected_doc["page_count"] or 1
            
            print(f"\n2️⃣ Processing Document ID: {selected_doc['document_id']}")
            print(f"   Document Type: {selected_doc['document_type']}")
            print(f"   Expected Pages: {selected_doc['page_count']}")
            print(f"   Available Images: {len(selected_doc['images'])}")
            
            # CRITICAL: Validate we're not accidentally processing another document's images
            if len(selected_doc['images']) == 0:
                result["errors"].append(f"No images found for document ID {selected_doc['document_id']}")
                print(f"❌ {result['errors'][-1]}")
                return result
            
            # Create unique cache filename using BOTH document_number AND document_id
            # This prevents PL689 from overwriting PL11089
            cached_pdf = os.path.join(cache_dir, f"{document_number}_doc{selected_doc['document_id']}.pdf")
            cached_pdf = os.path.normpath(cached_pdf)
            
            print(f"📄 Target PDF: {cached_pdf}")
            
            # Check if PDF already exists in cache
            if os.path.exists(cached_pdf):
                print(f"✅ Using cached PDF: {cached_pdf}")
                result["success"] = True
                result["pdf_path"] = cached_pdf
                
                # Try to get page count from existing PDF
                try:
                    from PyPDF2 import PdfReader
                    reader = PdfReader(cached_pdf)
                    result["actual_pages"] = len(reader.pages)
                    print(f"📑 Cached PDF has {result['actual_pages']} pages")
                except:
                    result["actual_pages"] = 1
                
                return result
            
            # Step 3: Convert each .bin page to PDF
            print(f"\n3️⃣ Converting {len(selected_doc['images'])} page(s) to PDF...")
            page_pdfs = []
            
            for i, image_info in enumerate(selected_doc['images'], 1):
                store_url = image_info.get("content_url")
                print(f"\n   Page {i}/{len(selected_doc['images'])}: {store_url}")
                
                # CRITICAL: Verify this image belongs to the correct document
                # By this point, resolve_store_urls_by_document_number should have already
                # filtered by document_number, so we just need to convert
                
                # Parse store URL to file path
                file_path = self.parse_store_url_to_path(store_url)
                
                if not file_path:
                    error = f"Page {i}: Invalid store URL"
                    print(f"      ❌ {error}")
                    result["errors"].append(error)
                    continue
                
                print(f"      File: {file_path}")
                
                # Check if file exists
                if not os.path.exists(file_path):
                    error = f"Page {i}: File not found: {file_path}"
                    print(f"      ❌ {error}")
                    result["errors"].append(error)
                    continue
                
                print(f"      ✅ File exists ({os.path.getsize(file_path):,} bytes)")
                
                # Check if it's a JPEG
                if not self._is_jpeg_file(file_path):
                    error = f"Page {i}: Not a JPEG file"
                    print(f"      ❌ {error}")
                    result["errors"].append(error)
                    continue
                
                print(f"      ✅ Valid JPEG")
                
                # Convert to temporary PDF in cache directory (NOT contentstore!)
                temp_filename = f"{document_number}_doc{selected_doc['document_id']}_page{i}.pdf"
                temp_pdf = os.path.join(cache_dir, temp_filename)
                temp_pdf = os.path.normpath(temp_pdf)
                
                print(f"      🔄 Converting to: {temp_pdf}")
                converted = self.convert_bin_to_pdf(file_path, temp_pdf)
                
                if converted and os.path.exists(converted):
                    page_pdfs.append(converted)
                    print(f"      ✅ Converted ({os.path.getsize(converted):,} bytes)")
                else:
                    error = f"Page {i}: Conversion failed"
                    print(f"      ❌ {error}")
                    result["errors"].append(error)
            
            # Step 4: Merge PDFs if multiple pages
            if not page_pdfs:
                result["errors"].append("No pages were successfully converted")
                return result
            
            result["actual_pages"] = len(page_pdfs)
            print(f"\n4️⃣ Merging {len(page_pdfs)} page(s)...")
            
            if len(page_pdfs) == 1:
                # Single page - just copy
                shutil.copy2(page_pdfs[0], cached_pdf)
                print(f"   ✅ Single page copied to cache")
            else:
                # Multiple pages - merge
                try:
                    from PyPDF2 import PdfMerger
                    
                    merger = PdfMerger()
                    for pdf in page_pdfs:
                        merger.append(pdf)
                    
                    merger.write(cached_pdf)
                    merger.close()
                    print(f"   ✅ Merged {len(page_pdfs)} pages")
                except Exception as e:
                    error = f"PDF merge failed: {e}"
                    print(f"   ❌ {error}")
                    result["errors"].append(error)
                    return result
            
            # Cleanup temporary PDFs
            print(f"\n5️⃣ Cleaning up temporary files...")
            for temp_pdf in page_pdfs:
                try:
                    if temp_pdf != cached_pdf:
                        os.remove(temp_pdf)
                        print(f"   🗑️  Removed: {os.path.basename(temp_pdf)}")
                except Exception as e:
                    print(f"   ⚠️  Could not remove {temp_pdf}: {e}")
            
            # Final verification
            if os.path.exists(cached_pdf):
                result["success"] = True
                result["pdf_path"] = cached_pdf
                print(f"\n✅ SUCCESS: PDF generated at {cached_pdf}")
                print(f"   Document ID: {result['document_id']}")
                print(f"   Size: {os.path.getsize(cached_pdf):,} bytes")
                print(f"   Pages: {result['actual_pages']}/{result['expected_pages']}")
            else:
                result["errors"].append("PDF file not found after generation")
            
            return result
            
        except Exception as e:
            error = f"Unexpected error: {str(e)}"
            print(f"\n❌ {error}")
            result["errors"].append(error)
            import traceback
            traceback.print_exc()
            return result

def get_contentstore_base():
    """
    Determine the contentstore base path based on environment.
    
    Priority:
    1. Environment variable CONTENTSTORE_BASE
    2. Auto-detect based on OS and hostname
    3. Default paths
    
    Returns:
        str: Path to contentstore base directory
    """
    
    # 1. Check environment variable first (highest priority)
    env_path = os.getenv("CONTENTSTORE_BASE")
    if env_path:
        if os.path.exists(env_path):
            print(f"✅ Using CONTENTSTORE_BASE from environment: {env_path}")
            return env_path
        else:
            print(f"⚠️  CONTENTSTORE_BASE set but path doesn't exist: {env_path}")
    
    # 2. Auto-detect based on OS
    import platform
    hostname = platform.node()
    os_name = platform.system()
    
    print(f"🔍 Auto-detecting contentstore path...")
    print(f"   OS: {os_name}")
    print(f"   Hostname: {hostname}")
    
    # Check common mount points
    possible_paths = []
    
    if os_name == "Linux":
        # Linux paths (in order of preference)
        # Prefer local paths over network mounts for independent setup
        home_dir = os.path.expanduser("~")
        possible_paths = [
            os.path.join(home_dir, "aumentum_contentstore"),  # Local user directory (independent)
            "/opt/aumentum/contentstore",  # Local system directory
            "/mnt/aumentum_contentstore/contentstore",  # Network mount (legacy)
            "/mnt/lrs_storage/contentstore",
            "/media/aumentum/contentstore",
            "/opt/alfresco/contentstore",
            "/var/alfresco/contentstore",
        ]
    elif os_name == "Windows":
        # Windows paths
        possible_paths = [
            r"D:\LRS_STORAGE\contentstore",
            r"C:\Alfresco\alf_data\contentstore",
            r"\\10.10.10.3\LRS_STORAGE\contentstore",  # UNC path
        ]
    elif os_name == "Darwin":  # macOS
        possible_paths = [
            "/Volumes/LRS_STORAGE/contentstore",
            "/opt/alfresco/contentstore",
        ]
    
    # Try each path
    for path in possible_paths:
        if os.path.exists(path):
            print(f"✅ Found contentstore at: {path}")
            return path
        else:
            print(f"   ✗ Not found: {path}")
    
    # 3. Fallback to default (will likely fail but allows explicit configuration)
    default_path = "/mnt/aumentum_contentstore/contentstore"
    print(f"⚠️  No contentstore found, using default: {default_path}")
    print(f"⚠️  Please mount the Windows share or set CONTENTSTORE_BASE environment variable")
    return default_path


# ===========================
# CONFIGURATION
# ===========================

# Default database configuration for Aumentum Registry (LRS43)
# Supports both MSSQL and MySQL
# 
# For MSSQL:
# Platform-specific driver settings:
# Windows: "ODBC Driver 18 for SQL Server"
# Linux:    "FreeTDS" or "ODBC Driver 18 for SQL Server"
# DEFAULT_DB_CONFIG = {
#     "type": "mssql",  # Database type
#     "server": "10.10.10.3",  # or HOST\\INSTANCE for named instance
#     "port": 1433,
#     "database": "LRS43",
#     "username": "root",
#     "password": "5bad89a3",
#     "driver": "FreeTDS",  # Change to "FreeTDS" on Linux
#     "encrypt": "no",
#     "trust_server_certificate": "yes",
# }
#
# For MySQL:
# DEFAULT_DB_CONFIG = {
#     "type": "mysql",  # Database type
#     "host": "localhost",  # or MySQL server IP
#     "port": 3306,
#     "database": "LRS43",
#     "username": "root",
#     "password": "your_mysql_password",
#     "charset": "utf8mb4",
# }

# Current configuration (MSSQL)
# DEFAULT_DB_CONFIG = {
#     "type": "mssql",  # Change to "mysql" for MySQL
#     "server": "MSSQL_LRS43",  # ← Use DSN name from /etc/freetds/freetds.conf (MSSQL) or hostname (MySQL)
#     "port": 1433,              # 1433 for MSSQL, 3306 for MySQL
#     "database": "LRS43",
#     "username": "root",
#     "password": "5bad89a3",
#     "driver": "FreeTDS",  # Only used for MSSQL
#     "encrypt": "no",  # Only used for MSSQL
#     "trust_server_certificate": "yes",  # Only used for MSSQL
#     "charset": "utf8mb4",  # Only used for MySQL
# }
# Database configuration: use backend config when available, else env
try:
    from backend.app.config import get_db_config
    DEFAULT_DB_CONFIG = get_db_config()
except ImportError:
    import os
    _db_type = os.getenv("DB_TYPE", "mysql").lower()
    if _db_type == "mysql":
        DEFAULT_DB_CONFIG = {
            "type": "mysql",
            "host": os.getenv("DB_HOST", "localhost"),
            "port": int(os.getenv("DB_PORT", "3306")),
            "database": os.getenv("DB_NAME", "LRS43"),
            "username": os.getenv("DB_USER", "root"),
            "password": os.getenv("DB_PASSWORD", ""),
            "charset": os.getenv("DB_CHARSET", "utf8mb4"),
        }
    else:
        DEFAULT_DB_CONFIG = {
            "type": "mssql",
            "server": os.getenv("DB_SERVER", "MSSQL_LRS43"),
            "port": int(os.getenv("DB_PORT", "1433")),
            "database": os.getenv("DB_NAME", "LRS43"),
            "username": os.getenv("DB_USER", "root"),
            "password": os.getenv("DB_PASSWORD", ""),
            "driver": os.getenv("DB_DRIVER", "FreeTDS"),
            "encrypt": "no",
            "trust_server_certificate": "yes",
        }
# Platform-specific contentstore paths:
# Windows: r"C:\Alfresco\alf_data\contentstore"
# Linux:   "/opt/alfresco/contentstore"
# Network: "/mnt/contentstore" or "//server/share/contentstore"
# DEFAULT_CONTENTSTORE_BASE = r"C:\Alfresco\alf_data\contentstore"  # Adjust to your actual contentstore path
# DEFAULT_CONTENTSTORE_BASE = r"D:\LRS_STORAGE\contentstore"
DEFAULT_CONTENTSTORE_BASE = get_contentstore_base()


def validate_configuration():
    """Validate configuration on startup."""
    
    print("\n" + "="*80)
    print("AUMENTUM SERVICE CONFIGURATION VALIDATION")
    print("="*80)
    
    # Check contentstore
    print(f"\n📁 Contentstore Base: {DEFAULT_CONTENTSTORE_BASE}")
    
    if not os.path.exists(DEFAULT_CONTENTSTORE_BASE):
        print("❌ ERROR: Contentstore path does not exist!")
        print("\n💡 Solutions:")
        print("   1. Mount the Windows share:")
        print("      sudo mount -t cifs //10.10.10.3/LRS_STORAGE /mnt/aumentum_contentstore \\")
        print("        -o username=Administrator,password=YOURPASS")
        print("")
        print("   2. Or set environment variable:")
        print("      export CONTENTSTORE_BASE=/your/mount/point/contentstore")
        print("")
        return False
    
    print("✅ Contentstore path exists")
    
    # Check for content
    try:
        items = os.listdir(DEFAULT_CONTENTSTORE_BASE)
        year_dirs = [d for d in items if d.isdigit() and len(d) == 4]
        
        if year_dirs:
            print(f"✅ Found {len(year_dirs)} year directories: {sorted(year_dirs)[:5]}")
        else:
            print("⚠️  No year directories found - is this the correct path?")
            print(f"   Found items: {items[:10]}")
    except Exception as e:
        print(f"⚠️  Error reading contentstore: {e}")
        return False
    
    # Check database connection
    db_type = DEFAULT_DB_CONFIG.get("type", "mssql").lower()
    db_server = DEFAULT_DB_CONFIG.get("server") or DEFAULT_DB_CONFIG.get("host", "unknown")
    print(f"\n🔌 Database: {db_type.upper()} - {db_server}/{DEFAULT_DB_CONFIG['database']}")
    try:
        service = AumentumBrowserService(
            db_config=DEFAULT_DB_CONFIG,
            contentstore_base=DEFAULT_CONTENTSTORE_BASE
        )
        conn = service._get_db_connection()
        cursor = conn.cursor()
        
        # Use appropriate table name based on database type
        if db_type == "mysql":
            cursor.execute("SELECT COUNT(*) FROM lr_source_document")
        else:
            cursor.execute("SELECT COUNT(*) FROM LRSAdmin.lr_source_document")
        
        count = cursor.fetchone()[0]
        cursor.close()
        conn.close()
        print(f"✅ Database connection successful ({count:,} documents)")
    except Exception as e:
        print(f"❌ Database connection failed: {e}")
        return False
    
    print("\n" + "="*80)
    print("✅ CONFIGURATION VALID - Ready to start!")
    print("="*80 + "\n")
    
    return True


# ===========================
# EXAMPLE USAGE
# ===========================

def example_usage():
    """Example of using the Aumentum Browser Service."""
    
    # Initialize service
    service = AumentumBrowserService(
        db_config=DEFAULT_DB_CONFIG,
        contentstore_base=DEFAULT_CONTENTSTORE_BASE
    )
    
    # Discover database schema
    print("🔍 Discovering database schema...")
    schema = service.discover_database_schema()
    print(f"📋 Found {len(schema['tables'])} tables")
    print(f"🏠 Property tables: {schema['property_tables']}")
    print(f"📄 Document tables: {schema['document_tables']}")
    
    # Lookup a specific document (example path)
    example_path = "2014/11/03/0c21fb5b-8fec-433b-b5f1-e79d1bbe12e3.bin"
    print(f"\n🔎 Looking up document: {example_path}")
    
    result = service.lookup_document(example_path, convert_to_pdf=True)
    print(f"✅ File exists: {result['file_exists']}")
    print(f"📸 Is JPEG: {result['is_jpeg']}")
    if result['pdf_path']:
        print(f"📄 PDF created: {result['pdf_path']}")
    
    # Get document paths by property (placeholder for now)
    print(f"\n🏘️  Looking for property documents...")
    doc_paths = service.get_document_paths_by_property()
    print(f"📚 Found {len(doc_paths)} documents")


if __name__ == "__main__":
    validate_configuration()
    example_usage()