-- ================================================================================
-- MATCH DOCUMENTS TO FILES USING TRANSACTION TIMESTAMPS
-- ================================================================================
-- Strategy: Documents scanned together will have:
-- 1. Same transaction (indexing/scanning transaction type)
-- 2. Similar transaction dates
-- 3. Alfresco nodes created around same time as transaction
-- ================================================================================

-- STEP 1: Find the transaction type for "indexing" or "scanning"
-- ================================================================================
PRINT '1️⃣ Finding Scanning/Indexing Transaction Type';
PRINT '================================================================';

SELECT 
    Id,
    code,
    label,
    descr,
    category
FROM LRSAdmin.lr_dictionary
WHERE category = 'transaction_type'
AND (
    label LIKE '%scan%' 
    OR label LIKE '%index%'
    OR label LIKE '%document%'
    OR code LIKE '%scan%'
    OR code LIKE '%index%'
)
ORDER BY Id;

-- STEP 2: Get transaction information for our problem documents
-- ================================================================================
PRINT '';
PRINT '2️⃣ Transaction Information for Problem Documents';
PRINT '================================================================';

SELECT 
    sd.document_number,
    sd.id AS document_id,
    sd.document_type,
    sd.create_date AS doc_db_created,
    sd.submission AS doc_submission_date,
    t.id AS transaction_id,
    t.transaction_type,
    t.transaction_number,
    t.transaction_status,
    tt.label AS transaction_type_label,
    t.submission_date AS transaction_date,
    t.register_date,
    t.approval_date
FROM LRSAdmin.lr_source_document sd
LEFT JOIN LRSAdmin.lr_transaction_document td ON td.document_id = sd.id
LEFT JOIN LRSAdmin.lr_transaction t ON t.id = td.transaction_id
LEFT JOIN LRSAdmin.lr_dictionary tt ON tt.Id = t.transaction_type AND tt.category = 'transaction_type'
WHERE sd.document_number IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
ORDER BY sd.document_number, sd.id;

-- STEP 3: Match Alfresco nodes by creation time
-- ================================================================================
PRINT '';
PRINT '3️⃣ Alfresco Nodes with Creation Times';
PRINT '================================================================';

SELECT 
    np.string_value AS labeled_as,
    n.id AS node_id,
    n.audit_created AS node_created_time,
    cu.content_url,
    cu.content_size,
    sd.document_number AS actual_document_in_db,
    sd.create_date AS doc_db_created
FROM LRSAdmin.alf_node_properties np
JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id
JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0
LEFT JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id
LEFT JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id
LEFT JOIN LRSAdmin.lr_source_document sd ON sd.document_number = np.string_value
WHERE np.string_value IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
AND q.local_name = 'sourceRids'
ORDER BY n.audit_created;

-- STEP 4: Find nodes created NEAR document creation times
-- ================================================================================
PRINT '';
PRINT '4️⃣ Match Nodes to Documents by Timestamp Proximity';
PRINT '================================================================';

-- For each document, find nodes created within 24 hours of document creation
WITH DocumentTimes AS (
    SELECT DISTINCT
        document_number,
        MIN(create_date) AS earliest_create_date
    FROM LRSAdmin.lr_source_document
    WHERE document_number IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
    GROUP BY document_number
),
NodeTimes AS (
    SELECT 
        np.string_value AS labeled_as,
        n.id AS node_id,
        n.audit_created AS node_created_time,
        cu.content_url,
        cu.content_size
    FROM LRSAdmin.alf_node_properties np
    JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id
    JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0
    LEFT JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id
    LEFT JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id
    WHERE q.local_name = 'sourceRids'
    AND n.audit_created BETWEEN '2015-03-01' AND '2015-08-01'
)
SELECT 
    dt.document_number AS seeking_file_for,
    dt.earliest_create_date AS doc_created,
    nt.labeled_as AS node_labeled_as,
    nt.node_id,
    nt.node_created_time,
    nt.content_url,
    DATEDIFF(HOUR, dt.earliest_create_date, CAST(nt.node_created_time AS datetime)) AS hours_diff,
    CASE 
        WHEN ABS(DATEDIFF(HOUR, dt.earliest_create_date, CAST(nt.node_created_time AS datetime))) <= 24 THEN 'MATCH'
        ELSE 'NO_MATCH'
    END AS time_match
FROM DocumentTimes dt
CROSS JOIN NodeTimes nt
WHERE nt.labeled_as IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
AND ABS(DATEDIFF(HOUR, dt.earliest_create_date, CAST(nt.node_created_time AS datetime))) <= 72
ORDER BY dt.document_number, ABS(DATEDIFF(HOUR, dt.earliest_create_date, CAST(nt.node_created_time AS datetime)));

-- STEP 5: Smart matching - Find the correct file for each document
-- ================================================================================
PRINT '';
PRINT '5️⃣ Suggested Correct Mappings (Based on Timestamp Proximity)';
PRINT '================================================================';

-- Match documents to nodes based on:
-- 1. Document create_date should be close to node audit_created
-- 2. Document submission date should match URL date (year at least)
WITH RankedMatches AS (
    SELECT 
        sd.document_number,
        sd.id AS document_id,
        sd.create_date AS doc_created,
        sd.submission AS doc_submission,
        np.string_value AS node_labeled_as,
        n.id AS node_id,
        n.audit_created AS node_created,
        cu.content_url,
        cu.content_size,
        ABS(DATEDIFF(HOUR, sd.create_date, CAST(n.audit_created AS datetime))) AS hours_diff,
        ROW_NUMBER() OVER (
            PARTITION BY sd.document_number 
            ORDER BY ABS(DATEDIFF(HOUR, sd.create_date, CAST(n.audit_created AS datetime)))
        ) AS match_rank
    FROM LRSAdmin.lr_source_document sd
    CROSS JOIN LRSAdmin.alf_node_properties np
    JOIN LRSAdmin.alf_qname q ON q.id = np.qname_id
    JOIN LRSAdmin.alf_node n ON n.id = np.node_id AND n.node_deleted = 0
    LEFT JOIN LRSAdmin.alf_content_data cd ON cd.id = n.id
    LEFT JOIN LRSAdmin.alf_content_url cu ON cu.id = cd.content_url_id
    WHERE sd.document_number IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
    AND np.string_value IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
    AND q.local_name = 'sourceRids'
    AND sd.id IN (
        SELECT MIN(id) FROM LRSAdmin.lr_source_document 
        WHERE document_number IN ('PL11089', 'PL689', 'BP102', 'PL6204', 'PL12321')
        GROUP BY document_number
    )
)
SELECT 
    document_number AS seeking_file_for,
    node_labeled_as AS closest_node_label,
    content_url AS suggested_correct_url,
    hours_diff AS time_difference_hours,
    CASE 
        WHEN document_number = node_labeled_as THEN '✅ ALREADY CORRECT'
        WHEN hours_diff <= 24 THEN '✅ STRONG MATCH'
        WHEN hours_diff <= 72 THEN '⚠️ POSSIBLE MATCH'
        ELSE '❌ WEAK MATCH'
    END AS confidence
FROM RankedMatches
WHERE match_rank <= 3
ORDER BY document_number, match_rank;

PRINT '';
PRINT '================================================================';
PRINT 'Analysis Complete';
PRINT '================================================================';
PRINT 'Next Steps:';
PRINT '1. Review timestamp matches above';
PRINT '2. Use "STRONG MATCH" or "POSSIBLE MATCH" to identify correct files';
PRINT '3. Update CORRECT_FILE_MAPPING in Python code';
PRINT '================================================================';

