import { createWorker } from 'tesseract.js';
import mammoth from 'mammoth/mammoth.browser';
import api from './Api'

// Word limit per chunk; chunkText hands this value to naiveChunking as its per-chunk maximum.
const WORDS_PER_CHUNK = 512;


/**
 * Turns a Word document into a list of text chunks.
 *
 * The input is converted to HTML by extractTextFromWord and the resulting
 * markup is split by chunkTextFromHTML.
 *
 * @param {*} typedArray - Document bytes, forwarded untouched to extractTextFromWord.
 * @returns {Promise<string[]>} Chunks produced by chunkTextFromHTML.
 */
export const processWordDoc = async (typedArray) =>
    chunkTextFromHTML(await extractTextFromWord(typedArray));

/**
 * Turns a text-based PDF into a list of text chunks.
 *
 * All text items are collected with extractAllTextItems and then chunked by
 * extractTextChunksFromText.
 *
 * @param {object} pdfDoc - PDF document handle, passed straight to extractAllTextItems.
 * @returns {Promise<string[]>} Chunks produced by extractTextChunksFromText.
 */
export const processPdf = async (pdfDoc) =>
    extractTextChunksFromText(await extractAllTextItems(pdfDoc));

/**
 * Turns a PDF that needs OCR into a list of text chunks.
 *
 * Thin alias for chunkPdfOcr, which does the actual work.
 *
 * @param {object} pdfDoc - PDF document handle, passed straight to chunkPdfOcr.
 * @returns {Promise<string[]>} Whatever chunkPdfOcr resolves to.
 */
export const processPdfOcr = async (pdfDoc) => {
    const chunks = await chunkPdfOcr(pdfDoc);
    return chunks;
};


/**
 * Runs OCR over a PDF and chunks the recognised text.
 *
 * @param {object} pdfDoc - PDF document handle, passed straight to performOcrOnPdf.
 * @returns {Promise<string[]>} Result of chunkText applied to the OCR output.
 */
export const chunkPdfOcr = async (pdfDoc) =>
    chunkText(await performOcrOnPdf(pdfDoc));

/**
 * Recognises the text of every page of a PDF with Tesseract.
 *
 * Pages are processed one after another (1-based page numbers); each page is
 * rendered to a canvas by pdfPageToCanvas and fed to the OCR worker. The
 * recognised text of each page is followed by a newline.
 *
 * @param {object} pdfDoc - Object exposing `numPages` and `getPage(pageNum)`.
 * @returns {Promise<string>} Concatenated OCR text of all pages.
 * @throws Rethrows any error from page loading, rendering or recognition;
 *         the worker is terminated before the error propagates.
 */
const performOcrOnPdf = async (pdfDoc) => {
    const worker = await createWorker("eng+nld", 1);
    let ocrText = '';
    try {
        for (let pageNum = 1; pageNum <= pdfDoc.numPages; pageNum++) {
            const page = await pdfDoc.getPage(pageNum);
            const canvas = await pdfPageToCanvas(page);
            const { data: { text } } = await worker.recognize(canvas);
            ocrText += text + '\n';
        }
    } finally {
        // Always release the worker, even when a page fails, so it does not leak.
        await worker.terminate();
    }
    return ocrText;
};

/**
 * Renders a single PDF page onto a freshly created canvas element.
 *
 * The page is rendered at a fixed scale of 1.5 and the canvas is sized to the
 * resulting viewport.
 *
 * @param {object} page - Object exposing `getViewport({ scale })` and `render(...)`.
 * @returns {Promise<HTMLCanvasElement>} The canvas, once rendering has finished.
 */
const pdfPageToCanvas = async (page) => {
    const renderScale = 1.5;
    const viewport = page.getViewport({ scale: renderScale });

    const canvas = document.createElement('canvas');
    const canvasContext = canvas.getContext('2d');
    // Height first, then width: same assignment order as before.
    Object.assign(canvas, { height: viewport.height, width: viewport.width });

    const renderTask = page.render({ canvasContext, viewport });
    await renderTask.promise;
    return canvas;
};

/**
 * Tells whether a file name has the `.docx` extension.
 *
 * The comparison ignores letter case, so `REPORT.DOCX` is accepted as well.
 * A missing or non-string name yields `false` instead of throwing.
 *
 * @param {string} name_of_file - File name to inspect.
 * @returns {boolean} `true` when the name ends with `.docx` (any casing).
 */
export const isWordDocument = (name_of_file) => {
    if (typeof name_of_file !== 'string') {
        return false;
    }
    return name_of_file.toLowerCase().endsWith('.docx');
};

/**
 * Converts a Word document to HTML with mammoth.
 *
 * Despite the name, the resolved value is the HTML string from mammoth's
 * result (`result.value`), not plain text.
 *
 * @param {*} arrayBuffer - Document bytes, passed to mammoth as the `arrayBuffer` option.
 * @returns {Promise<string>} The `value` field of mammoth's conversion result.
 */
const extractTextFromWord = async (arrayBuffer) => {
    const { value } = await mammoth.convertToHtml({ arrayBuffer });
    return value;
};

/**
 * Splits an HTML document into chunks, starting a new chunk at every
 * paragraph that begins with a section number.
 *
 * The body is walked depth-first in document order:
 *  - text nodes outside of <p> elements are appended verbatim to the chunk
 *    being built;
 *  - a <p> whose trimmed text starts with digits, optional ".digits" groups
 *    and a whitespace character (e.g. "3 " or "2.1.4 ") closes the chunk
 *    being built and opens a new one;
 *  - any other <p> is appended to the chunk being built, preceded by a space;
 *  - all other elements are only descended into.
 *
 * @param {string} htmlString - HTML markup to parse.
 * @returns {string[]} Trimmed, non-empty chunks in document order.
 */
const chunkTextFromHTML = (htmlString) => {
    const { body } = new DOMParser().parseFromString(htmlString, 'text/html');

    // Matches a leading section number such as "1 " or "1.2.3 ".
    const sectionNumberPattern = /^\d+(\.\d+)*\s/;

    const finished = [];
    let pending = '';

    // Stores the chunk under construction, unless it is blank.
    const flushPending = () => {
        const trimmed = pending.trim();
        if (trimmed) {
            finished.push(trimmed);
        }
    };

    const visit = (node) => {
        if (node.nodeType === Node.TEXT_NODE) {
            pending += node.textContent;
            return;
        }
        if (node.nodeType !== Node.ELEMENT_NODE) {
            return;
        }
        if (node.tagName !== 'P') {
            for (const child of Array.from(node.childNodes)) {
                visit(child);
            }
            return;
        }

        const paragraph = node.textContent.trim();
        if (sectionNumberPattern.test(paragraph)) {
            flushPending();
            pending = paragraph;
        } else {
            pending += ` ${paragraph}`;
        }
    };

    for (const topLevelNode of Array.from(body.childNodes)) {
        visit(topLevelNode);
    }

    // Whatever is still being built becomes the final chunk.
    flushPending();

    return finished;
};


/**
 * Builds text chunks from a list of PDF text items.
 *
 * The items are first merged into one string by processTextItems, which is
 * then split by chunkText.
 *
 * @param {Array<{str: string}>} textItems - Text items, each carrying its text in `str`.
 * @returns {string[]} Chunks produced by chunkText.
 */
export const extractTextChunksFromText = (textItems) =>
    chunkText(processTextItems(textItems));


/**
 * Collects the text items of every page of a PDF into one flat array.
 *
 * Pages are loaded sequentially, from page 1 up to `pdfDoc.numPages`, and the
 * `items` of each page's text content are appended in page order.
 *
 * @param {object} pdfDoc - Object exposing `numPages` and `getPage(pageNum)`.
 * @returns {Promise<Array>} All text items of the document, in page order.
 */
export const extractAllTextItems = async (pdfDoc) => {
    const itemsPerPage = [];
    let pageNum = 1;
    while (pageNum <= pdfDoc.numPages) {
        const page = await pdfDoc.getPage(pageNum);
        const { items } = await page.getTextContent();
        itemsPerPage.push(items);
        pageNum += 1;
    }
    return itemsPerPage.reduce((all, pageItems) => all.concat(pageItems), []);
};

/**
 * Pre-processes PDF text items and joins them into a single string.
 *
 * Walking the items in order:
 *  - an item consisting only of digits starts a legal article number; it is
 *    merged with the following " " / digits pairs by processLegalArticle
 *    (e.g. "1", " ", "2" becomes "1.2") and the walk resumes after them;
 *  - an item ending in "Artikel <number>" is emitted with a leading newline;
 *  - every other item is emitted unchanged.
 * The emitted strings are then combined by joinTextItems.
 *
 * @param {Array<{str: string}>} allTextItems - Text items in reading order.
 * @returns {string} The joined text.
 */
const processTextItems = (allTextItems) => {
    const pieces = [];
    for (let idx = 0; idx < allTextItems.length; ) {
        const current = allTextItems[idx];

        if (isLegalArticleStart(current)) {
            const merged = processLegalArticle(allTextItems, idx);
            pieces.push(merged.combinedItem.str);
            idx = merged.newIndex;
            continue;
        }

        pieces.push(isDutchLegalArticleStart(current) ? "\n" + current.str : current.str);
        idx += 1;
    }
    return joinTextItems(pieces);
};

/**
 * Checks whether a text item ends with a Dutch article heading
 * ("Artikel " followed by digits).
 *
 * @param {{str: string}} item - Text item to test.
 * @returns {RegExpMatchArray|null} The match (truthy) or `null` when there is none.
 */
const isDutchLegalArticleStart = (item) => {
    const dutchArticleHeading = /Artikel \d+$/;
    return item.str.match(dutchArticleHeading);
};

/**
 * Checks whether a text item consists solely of digits, which marks the
 * possible start of a numbered legal article.
 *
 * @param {{str: string}} item - Text item to test.
 * @returns {RegExpMatchArray|null} The match (truthy) or `null` when there is none.
 */
const isLegalArticleStart = (item) => {
    const digitsOnly = /^\d+$/;
    return item.str.match(digitsOnly);
};

/**
 * Heuristic test for strings that look like a whole-currency amount.
 *
 * A string qualifies when it ends with one or more digits followed by ".00";
 * an optional "€" or "$" may directly precede the digits. Because the pattern
 * is not anchored at the start, any leading text is accepted.
 *
 * @param {string} str - Text to test.
 * @returns {boolean} `true` when the string ends in "<digits>.00".
 */
const isFinancialAmount = (str) => /(€|\$)?\d+\.00$/.test(str);

/**
 * Merges a legal article number that was split over several text items.
 *
 * Starting at `startIndex`, every following pair of items made of a single
 * space (" ") and a digits-only item is consumed, and the digit groups are
 * joined with periods: "1", " ", "2", " ", "3" becomes "1.2.3".
 *
 * @param {Array<{str: string, dir: *, width: *}>} items - All text items.
 * @param {number} startIndex - Index of the first digits-only item.
 * @returns {{combinedItem: {str: string, dir: *, width: *}, newIndex: number}}
 *          The merged item (with `dir` and `width` copied from the first item)
 *          and the index of the first item that was not consumed.
 */
const processLegalArticle = (items, startIndex) => {
    const first = items[startIndex];
    const digitGroups = [first.str];

    // True when items[pos] is a lone space directly followed by a digits-only item.
    const continuesNumber = (pos) =>
        pos < items.length
        && items[pos].str === ' '
        && items[pos + 1]
        && items[pos + 1].str.match(/^\d+$/);

    let cursor = startIndex + 1;
    while (continuesNumber(cursor)) {
        digitGroups.push(items[cursor + 1].str);
        cursor += 2;
    }

    const { dir, width } = first;
    return {
        combinedItem: { str: digitGroups.join('.'), dir, width },
        newIndex: cursor,
    };
};

/**
 * Concatenates text fragments into one string.
 *
 * A fragment containing a dotted number such as "1.2" or "3.4.5" is emitted
 * on a new line (prefixed with "\n", no trailing space); dotted numbers whose
 * fractional part ends in "00" (e.g. "5.00") are not treated that way. Every
 * other fragment is emitted followed by a single space.
 *
 * @param {string[]} items - Text fragments in reading order.
 * @returns {string} The concatenated text.
 */
const joinTextItems = (items) => {
    // Dotted number whose part after the first period does not end in "00".
    const dottedNumber = /\b(\d+\.(?!\d*00\b)[\d.]+)\b/;
    return items.reduce(
        (joined, item) => joined + (dottedNumber.test(item) ? '\n' + item : item + ' '),
        '',
    );
};

/**
 * Returns the size of the largest chunk, measured as the number of pieces
 * obtained by splitting on single space characters.
 *
 * @param {string[]} chunks - Chunks to measure.
 * @returns {number} Largest count found, or 0 for an empty list.
 */
const findLargestWordCount = (chunks) => {
    let largest = 0;
    chunks.forEach((chunk) => {
        largest = Math.max(largest, chunk.split(' ').length);
    });
    return largest;
};

/**
 * Splits a document's text into chunks in three passes:
 *  1. at Dutch article headings ("\nArtikel <number>"),
 *  2. at dotted article numbers on a new line ("\n1.2", "\n3.4.5."),
 *  3. by word count, so that oversized chunks are cut down using
 *     WORDS_PER_CHUNK as the limit.
 *
 * Both article patterns capture the heading (group 1) and the body up to the
 * next heading or the end of the text (group 2), as regexChunking expects.
 *
 * @param {string} text - Full document text.
 * @returns {string[]} The resulting chunks, in document order.
 */
const chunkText = (text) => {
    // Fresh literals per call: the `g` flag makes these regexes stateful.
    const dutchArticleRegexPattern = /(\nArtikel [\d]+\b) ([\s\S]*?) (?=\n\bArtikel [\d]+\b\.?|$)/g;
    const digitLawArticleRegexPattern = /(\n\b\d+\.[\d.]+\b\.?) ([\s\S]*?) (?=\n\b\d+\.[\d.]+\b\.?|$)/g;

    let chunks = [text];
    chunks = regexChunking(chunks, dutchArticleRegexPattern);
    chunks = regexChunking(chunks, digitLawArticleRegexPattern);
    return naiveChunking(chunks, WORDS_PER_CHUNK);
};

/**
 * Splits every chunk into smaller chunks at the matches of a regex.
 *
 * For each match, the text between the previous match and this one becomes a
 * chunk of its own (when not blank), followed by a chunk made of capture
 * group 1 and capture group 2, each trimmed and joined with a single space.
 * Text after the last match is kept as a final chunk. Matches whose first
 * group looks like a financial amount (see isFinancialAmount) are not used as
 * split points; their text stays part of the surrounding chunk.
 *
 * @param {string[]} chunks - Chunks to split further.
 * @param {RegExp} regexPattern - Pattern with two capture groups. It must
 *        carry the `g` flag, otherwise `exec` never advances and the loop
 *        does not terminate.
 * @returns {string[]} The refined chunks, in order.
 */
const regexChunking = (chunks, regexPattern) => {
    const newChunks = [];

    chunks.forEach(chunk => {
        // A global regex remembers where its last search stopped; start every
        // chunk from the beginning regardless of what the regex did before.
        regexPattern.lastIndex = 0;

        let lastMatchEnd = 0;
        let match;
        while ((match = regexPattern.exec(chunk)) !== null) {
            // Check if the match is likely a financial amount
            if (isFinancialAmount(match[1])) {
                continue; // Skip this match
            }
            const textBeforeMatch = chunk.substring(lastMatchEnd, match.index).trim();
            if (textBeforeMatch) {
                newChunks.push(textBeforeMatch);
            }
            newChunks.push(match[1].trim() + " " + match[2].trim());
            lastMatchEnd = regexPattern.lastIndex;
        }

        const textAfterLastMatch = chunk.substring(lastMatchEnd).trim();
        if (textAfterLastMatch) {
            newChunks.push(textAfterLastMatch);
        }
    });

    return newChunks;
};

/**
 * Cuts every chunk that has more than `words_per_chunk` words into pieces of
 * at most `words_per_chunk` words each.
 *
 * Words are maximal runs of non-whitespace characters. Chunks within the
 * limit are passed through unchanged. Pieces of a split chunk keep the
 * original whitespace between their words; the whitespace between two pieces
 * is dropped. No word is lost: the last piece may hold fewer words than the
 * limit, down to a single one.
 *
 * @param {string[]} chunks - Chunks to limit in size.
 * @param {number} words_per_chunk - Maximum number of words per resulting chunk;
 *        must be a positive integer.
 * @returns {string[]} Chunks in their original order, none exceeding the limit.
 * @throws {RangeError} When `words_per_chunk` is not a positive integer.
 */
const naiveChunking = (chunks, words_per_chunk) => {
    if (!Number.isInteger(words_per_chunk) || words_per_chunk < 1) {
        throw new RangeError(`words_per_chunk must be a positive integer, got ${words_per_chunk}`);
    }

    // One piece: a word followed by up to (limit - 1) more whitespace-separated words.
    const piecePattern = new RegExp(`\\S+(?:\\s+\\S+){0,${words_per_chunk - 1}}`, 'g');

    const newChunks = [];
    chunks.forEach(chunk => {
        const words = chunk.match(/\S+/g);
        const wordCount = words ? words.length : 0;

        if (wordCount <= words_per_chunk) {
            // Within the limit: keep the chunk exactly as it is.
            newChunks.push(chunk);
            return;
        }

        // wordCount > limit >= 1, so there is at least one piece to match.
        for (const piece of chunk.match(piecePattern)) {
            newChunks.push(piece);
        }
    });
    return newChunks;
};


/**
 * Requests embeddings for a list of text chunks from the backend.
 *
 * Each chunk is sent as `{ text: chunk }` in a single POST to
 * `/generateEmbedding`. Request failures are logged with `console.error` and
 * reported to the caller as an empty array rather than as a rejection.
 *
 * @param {string[]} chunks - Text chunks to embed.
 * @returns {Promise<*>} The `data` field of the response, or `[]` when the request fails.
 */
export const generateDocumentEmbeddings = async (chunks) => {
    const payload = chunks.map((text) => ({ text }));

    try {
        const { data } = await api.post('/generateEmbedding', payload);
        return data;
    } catch (error) {
        console.error("Error while generating embeddings:", error);
        return [];
    }
};
