#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import io
import re

import numpy as np
from PIL import Image

from deepdoc.vision import OCR
from rag.nlp import attach_media_context, rag_tokenizer, tokenize

# Shared OCR engine: constructed once when this module is imported and reused
# by every chunk() call below.
ocr = OCR()

# Filename extensions that chunk() treats as video input. The original note
# describes these as the video formats Gemini supports; the entries are
# lowercase file suffixes (matched with str.endswith), not MIME type strings.
VIDEO_EXTS = [".mp4", ".mov", ".avi", ".flv", ".mpeg", ".mpg", ".webm", ".wmv", ".3gp", ".3gpp", ".mkv"]


def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Build a single OCR-based chunk from an image file.

    Video files (matched by extension against ``VIDEO_EXTS``) are not
    processed in this OCR-only mode: the callback is notified with progress
    ``-1`` and an empty list is returned.

    Args:
        filename: Name of the uploaded file; stored as ``docnm_kwd`` and,
            with a trailing alphabetic extension removed, tokenized into
            ``title_tks``.
        binary: Raw bytes of the file, decoded with PIL.
        tenant_id: Unused by this function; kept for the shared chunker
            signature.
        lang: Language name; compared case-insensitively with ``"english"``
            to choose the flag passed to ``tokenize``.
        callback: Optional callable invoked as ``callback(progress, message)``.
            A progress of ``-1`` is used for the failure/skip messages.
        **kwargs: May contain ``parser_config`` (a dict or ``None``) whose
            ``image_context_size`` entry is coerced to a non-negative int.

    Returns:
        The result of ``attach_media_context([doc], 0, image_ctx)`` when OCR
        produced text, otherwise an empty list.

    Raises:
        Whatever ``Image.open`` raises for undecodable image bytes, and
        ``ValueError``/``TypeError`` from ``int()`` when
        ``image_context_size`` is not numeric.
    """
    # Title tokens come from the filename with a trailing ".ext" (letters
    # only) stripped.
    stem = re.sub(r"\.[a-zA-Z]+$", "", filename)
    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(stem)}
    is_english = lang.lower() == "english"

    # A missing or None parser_config / image_context_size falls back to 0;
    # negative values are clamped to 0.
    parser_config = kwargs.get("parser_config", {}) or {}
    requested_ctx = int(parser_config.get("image_context_size", 0) or 0)
    image_ctx = max(0, requested_ctx)

    # Guard: videos cannot be handled by OCR alone.
    if filename.lower().endswith(tuple(VIDEO_EXTS)):
        if callback:
            callback(-1, "Video parsing requires IMAGE2TEXT model; OCR-only mode skipped.")
        return []

    img = Image.open(io.BytesIO(binary)).convert("RGB")
    doc["image"] = img
    doc["doc_type_kwd"] = "image"

    # Each OCR result is unpacked as (_, record); record[0] is the recognized
    # text. Empty strings are dropped before joining line by line.
    recognized = []
    for _, record in ocr(np.array(img)):
        text = record[0]
        if text:
            recognized.append(text)
    txt = "\n".join(recognized)

    if callback:
        preview = txt[:12]
        callback(0.6, "Finish OCR: (%s ...)" % preview)

    # Guard: nothing recognized means nothing to index.
    if not txt:
        if callback:
            callback(-1, "OCR returned empty text")
        return []

    tokenize(doc, txt, is_english)
    return attach_media_context([doc], 0, image_ctx)


def vision_llm_chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    """Report that vision-LLM chunking is unavailable and produce no chunks.

    None of the positional arguments or ``kwargs`` are read; the parameters
    are simply accepted and ignored.

    Args:
        filename: Ignored.
        binary: Ignored.
        tenant_id: Ignored.
        lang: Ignored.
        callback: Optional callable invoked once as
            ``callback(-1, "Vision LLM disabled; OCR-only mode active.")``.
        **kwargs: Ignored.

    Returns:
        Always a new empty list.
    """
    if not callback:
        return []

    # Progress -1 is the same value chunk() uses for its skip/failure messages.
    callback(-1, "Vision LLM disabled; OCR-only mode active.")
    return []
