<?php
date_default_timezone_set('Asia/Shanghai');
require_once("tools.php");

// CDN origin; handelImg() prefixes every image path from the data file with it.
$cdn = "https://cdn-resource.ekwing.com";
// Legacy word-count log, removed at the start of every run (see the unlink calls below).
$log = dirname(__FILE__)."/word_num.log";
$logDir = dirname(__FILE__) . "/log";

// q_id of the row being processed; deal() overwrites it and handelImg() echoes it.
$qid = 0;

$date = date("Ymd");
$time = date("His");

// One log file per outcome, grouped by run date and named after the start time.
$logs = [
    'success' => "{$logDir}/{$date}/success/{$time}.log",
    'failure' => "{$logDir}/{$date}/failure/{$time}.log",
    'ocr_error' => "{$logDir}/{$date}/error/{$time}.log",
];

// Create the directory of every log file. The loop variable must NOT be called
// $log: that would overwrite the word_num.log path above and make the unlink
// below delete the wrong file.
foreach ($logs as $logPath) {
    // Best effort: the directory usually exists already on repeated runs.
    @mkdir(dirname($logPath), 0777, true);
}


// Remove leftovers from a previous run; a missing file is not an error.
@unlink($log);
@unlink("{$logDir}/table.log");
@unlink($logs['failure']);
// Question ids that deal() excludes from OCR word-count error logging.
// The commented entries are kept for manual re-runs.
$ids = [
//    '65129', '96290', '62035', '62034', '353', '128150',
//    '165775', '186709', '174863', '237554', '238526',
//    '238527', '238606', '238607', '239290', '241421',
//    '241901', '242715', '242716', '242795', '242875',
//    '242876', '242876', '306770', '312369', '316999',
//    '317144', '317275', '325469', '325548', '325768',
//    '325927', '326006', '326085', '326164' , '326243',
//    '357723', '395176', '399496', '399576', '399577',
//    '399816', '399817', '400375', '400455', '400775',
//    '400856', '400935', '400936', '411201', '411423',
//    '453685', '487327', '192833', '106482', '101906',
//    '99516', '99457', '98448', '237476', '237477',
//    '509046', '241274', '241275', '495824', '555060',
//    '557595', '355107', '3015', '269936', '745471',
//    '757793', '524021',
//
//
//    '10835'
];

$i = 0;
$currentId = 0;
$handle = fopen("data.csv", "r");
if ($handle === false) {
    // Without a readable data file there is nothing to process; stop with a
    // clear message instead of failing inside fgetcsv().
    echo "Unable to open data.csv for reading\n";
    exit(1);
}
$firstLine = true;
$counter = 0;

// Handle only the previously failed ids (manual re-run helper):
//$ids = trim(file_get_contents(dirname(__FILE__) . "/log/20250908/failure/105254.log"));
//$ids = trim($ids, ",");
//$ids = explode(",", $ids);

// "\0" as delimiter and enclosure keeps each physical line in a single field,
// so $line[0] holds the whole row.
while (($line = fgetcsv($handle, 0, "\0", "\0", '\\')) !== false) {
    if ($firstLine) {
        // The first row is a header.
        $firstLine = false;
        continue;
    }

    if ($line === [null]) {
        // fgetcsv() reports a blank line as [null]; skip it instead of letting
        // deal() treat it as an unparseable row and abort the run.
        continue;
    }

    $counter++;
//    if ($counter > 1500){
//        break;
//    }
    deal($line);
}

fclose($handle);

/**
 * Builds an essay payload from one CSV row and submits it through addEssay().
 *
 * Element 0 of the row must hold a JSON document with `id`, `q_id` and
 * `data.cnt`. The question text, OCR results of the attached images
 * (via handelImg()), word limits, model answer and keywords are merged into
 * one payload. The outcome is appended to the success or failure log.
 *
 * Side effects: overwrites the globals $qid and $currentId, increments $i,
 * echoes progress, and terminates the script when a row cannot be decoded or
 * an `other` entry is not of type `txt`.
 *
 * @param array $line Row returned by fgetcsv(); only element 0 is read.
 * @return void
 */
function deal($line)
{
    global $i, $ids, $qid, $currentId, $cdn, $logs;

    $raw = $line[0] ?? '';
    $data = json_decode($raw, true);
    if (is_null($data)) {
        // Retry for CSV-quoted rows: strip the outer quotes and collapse doubled quotes.
        $unquoted = str_replace('""', '"', trim($raw, '"'));
        $data = json_decode($unquoted, true);
    }
    if (!isset($data['id'])) {
        // Unparseable row: dump it and stop the whole run.
        print_r($line);
        exit;
    }
    $id = $data['id'];
    // The code below already refers to this value as rid; after talking to the
    // acpf people, `id` is the value that must be passed on to Yisheng.
    $rid = $id;
    $qid = $data['q_id'] ?? null;

    $_skipImgIds = [61851, 474264, 553397, 544107, 544108, 301122, 277442, 512145, 797562, 798630, 799206, 803407, 799762, 915793, 925760, 948222, 958023, 980290, 980290, 1279828, 1497069, 1973479, 898170, 2366163, 2666557, 2346149, 2716142, 2722106, 2723969, 2782347, 2401357, 2556464];

    // Questions on the skip list are never processed.
    if (in_array($qid, $_skipImgIds)) {
        return;
    }

    // Only the questions on this allow-list are processed in the current run.
    if (!in_array($qid, [183405,183406,183407,186709,183404,434630,434720,541876,413783,281071,434000,1684805,1729228,1758374,1758375,2128753,2137920,2151732,2266560,2201465,2541177,2562971,2456064,2605171,2605401,2785064,2785148,2786525,2786441,2786358,2221393,2875137,2151735,2201466,2627411,2875138,3027865,2456063])) {
        return;
    }

    $content = $data['data']['cnt'] ?? [];

    // Model answer: the last article section with non-empty text wins;
    // fall back to the first tip when no article text exists.
    $article = "";
    foreach (($content['article'] ?? []) as $section) {
        if (!empty($section['content'][0]['content'])) {
            $article = $section['content'][0]['content'];
        }
    }
    if (!$article && !empty($content['tip'][0])) {
        $article = $content['tip'][0];
    }

    // Keywords come from the `other` entries, which must all be plain text.
    $other = $content['other'] ?? [];
    $keys = [];
    if ($other) {
        foreach ($other as $item) {
            if ($item['type'] != 'txt') {
                echo "others 非常规 id: {$rid}";
                print_r($other);
                exit;
            }
            $word = trim($item['content']);
            if ($word) {
                $keys[] = $word;
            }
        }
    }

    $ask = $content['text'] ?? '';
    $img = $content['img'] ?? [];

    $params = [
        'ask' => [$ask],
        'withs' => [
            'start' => null,
            'end' => null,
        ],
        'img' => [
            'url' => [],
            'desc' => [],
        ],
        'word_num' => [
            'min' => 0,
            'max' => 0,
        ],
    ];

    $currentId = $rid;
    if (in_array($qid, ['766220', '1406368', '1573253', '1710245']) || in_array($qid, $_skipImgIds)) {
        // Images of these questions are deliberately not sent to OCR.
        $imgCnt = [];
    } else {
        $imgCnt = handelImg($img);
    }

    // handelImg() may signal "skip this row" by returning false.
    if ($img && $imgCnt === false) {
        return;
    }

    // Last OCR response that carried a `data` section; passed to ocrErrLog().
    $response = null;
    if (!empty($imgCnt)) {
        foreach ($imgCnt as $url => $rs) {
            if (!isset($rs['data'])) {
                print_r($rs);
                echo "\n图片识别结果有问题\n";
                continue;
            }

            $response = $rs;
            $out = $rs['data']['outputs'] ?? [];
            $markdown = $out['markdown'] ?? null;
            $stemDesc = $out['stem_desc'] ?? null;
            $hasImage = $out['has_image'] ?? null;

            if (($out['is_essay'] ?? null) == IS_ESSAY) {
                // Essay image: append the recognised text as-is and pick up word limits.
                $params['ask'][] = trim((string) $markdown);
                if (!empty($out['word_rule'])) {
                    $params['word_num']['min'] = $out['word_min_count'] ?? null;
                    $params['word_num']['max'] = $out['word_max_count'] ?? null;
                }
                if (isset($out['word_num'])) {
                    $params['word_num']['min'] = intval($out['word_num']);
                }
                // Whether or not the image itself contains a picture, forward
                // its description whenever one was produced.
                if (($hasImage == '0' || $hasImage == '1') && $stemDesc) {
                    $params['img']['url'][] = $url;
                    $params['img']['desc'][] = $stemDesc;
                }
            } elseif (($markdown === null || $markdown === '') && $stemDesc != '') {
                // Empty markdown with a description is treated as a pure picture.
                $params['img']['url'][] = $url;
                $params['img']['desc'][] = $stemDesc;
            } else {
                if ($markdown) {
                    $params['ask'][] = trim($markdown);
                } else {
                    // An image without any markdown is most likely a recognition problem.
                    ocrErrLog('E1', $id, $url, $response);
                }
                if ($hasImage == '1' && $stemDesc) {
                    $params['img']['url'][] = $url;
                    $params['img']['desc'][] = $stemDesc;
                }
            }

            if (isset($out['stem_start'])) {
                $params['withs']['start'] = $out['stem_start'];
            }
            if (isset($out['stem_end'])) {
                $params['withs']['end'] = $out['stem_end'];
            }
        }
    }

    $ask = implode("\n", $params['ask']);
    $num = getWorkNum($ask);

    if (is_array($num)) {
        $params['word_num']['min'] = max($params['word_num']['min'], $num[0]);
        $params['word_num']['max'] = max($params['word_num']['max'], $num[1]);
    } elseif ($num == 0
        && $imgCnt
        && !in_array($rid, $ids)
        && !containSen($ask)
    ) {
        // Images were present but no word count was found: classify and log it.
        $imageUrls = array_keys($imgCnt);
        $fileUrl = $imageUrls[0];
        if (substr_count($ask, '|') > 2) {
            // A table without a word count is assumed to be a failed table recognition.
            ocrErrLog('E2', $rid, $fileUrl, $response);
        } elseif (!is_null($params['withs']['start']) || !is_null($params['withs']['end'])) {
            // A start/end sentence was recognised, so the OCR result is assumed to be usable.
            ocrErrLog('E3', $rid, $fileUrl, $response);
        } else {
            ocrErrLog('E4', $rid, $fileUrl, $response);
        }
    } else {
        $params['word_num']['min'] = max($params['word_num']['min'], $num);
    }

    $essayData = [
        'questionId' => "{$rid}",
        'name' => null,
        'stemUrls' => $params['img']['url'],
        'stemDesc' => implode("\n---\n", $params['img']['desc']),
        'stemText' => $ask,
        'minWords' => $params['word_num']['min'],
        'score' => 100,
        'answer' => $article,
        'keyWords' => $keys,
    ];

    if ($params['word_num']['max']) {
        $essayData['maxWords'] = $params['word_num']['max'];
    }

    // NOTE(review): the keys are spelled "them…", not "stem…"; kept unchanged —
    // confirm against the addEssay() API before renaming.
    if (!is_null($params['withs']['start'])) {
        $essayData['themStart'] = $params['withs']['start'];
    }
    if (!is_null($params['withs']['end'])) {
        $essayData['themEnd'] = $params['withs']['end'];
    }

    $rs = addEssay($essayData);
    if ($rs && isset($rs['code']) && $rs['code'] == 0) {
        file_put_contents($logs['success'], "{$rid},", FILE_APPEND);
    } else {
        // Record the failure so the question can be re-sent later.
        $msg = is_array($rs) ? ($rs['msg'] ?? '') : '';
        $log_cnt = "{$qid}, {$msg}";
        if ($imgCnt) {
            // Append the md5 of each image path (CDN prefix removed), which is
            // the name handelImg() uses for the cached OCR result.
            $_arrImgs = [];
            foreach (array_keys($imgCnt) as $url) {
                $_arrImgs[] = md5(str_replace($cdn, '', trim($url)));
            }
            $_imgs = implode(",", $_arrImgs);
            $log_cnt = "{$log_cnt}, {$_imgs}";
        }
        // Always write the line; it used to be dropped whenever images were present.
        file_put_contents($logs['failure'], "{$log_cnt}\n", FILE_APPEND);
    }

    $i++;

    echo "\n$rid, deal finished.\n";
}

/**
 * Placeholder for post-processing of the question text.
 *
 * Removes blank lines from a local copy of the text and checks whether it
 * mentions "要求" (requirements) or "注意" (notes). Nothing is done with the
 * result yet, so the function has no effect and returns null.
 *
 * @param string $ask Question text.
 * @return void
 */
function handleAsk($ask) {
    // Strip lines that consist of whitespace only.
    $ask = preg_replace("/^\s*[\r\n]/m", "", $ask);

    // Only these two keywords are of interest.
    $mentionsKeyword = strpos($ask, "要求") !== false || strpos($ask, "注意") !== false;
    if ($mentionsKeyword) {
        return;
    }

    // Texts without either keyword are not handled for now: the API does not need it.
}

/**
 * Runs OCR on every image of a question, caching each raw response on disk.
 *
 * @param array|null $arrImg Image paths relative to the CDN origin.
 * @return array Decoded OCR responses keyed by full image URL; a value is
 *               null when the response could not be decoded as JSON.
 */
function handelImg($arrImg) {
    global $qid, $cdn;

    $rtn = [];
    if (!is_array($arrImg)) {
        // Questions without an image list have nothing to recognise.
        return $rtn;
    }

    foreach ($arrImg as $img) {
        // Cache file named after the md5 of the relative image path.
        // NOTE(review): the literal backslashes only form a sub-directory on
        // Windows; left unchanged so existing cache files stay valid.
        $filename = dirname(__FILE__) . "\\ocr\\" . md5($img);
        $img = "{$cdn}{$img}";

        if (file_exists($filename)) {
            $rtn[$img] = json_decode(file_get_contents($filename), true);
            continue;
        }

        $rs = ocr($img);
        $decoded = json_decode($rs, true);
        $rtn[$img] = $decoded;
        // Progress marker: one q_id per fresh OCR call.
        echo $qid, ",";

        // Cache only responses that decode to an array; a garbage response
        // would otherwise be served from the cache on every later run.
        if (is_array($decoded)) {
            file_put_contents($filename, $rs);
        }
    }

    return $rtn;
}

/**
 * Prints a horizontal rule of '=' characters surrounded by newlines.
 *
 * @return void
 */
function drawLN() {
    $rule = "\n==================================================\n";
    print $rule;
}

/**
 * Returns a similarity ratio for two strings based on their Levenshtein distance.
 *
 * The result is 1 - distance / (byte length of the longer string); two empty
 * strings are defined to be identical (1.0).
 *
 * @param string $str1
 * @param string $str2
 * @return float|int 1 for identical strings, lower values for more edits.
 */
function stringSimilarity($str1, $str2) {
    // Byte length of the longer input; zero only when both inputs are empty.
    $longest = max(strlen($str1), strlen($str2));
    if ($longest == 0) {
        return 1.0;
    }

    $edits = levenshtein($str1, $str2);

    return 1 - ($edits / $longest);
}

/**
 * Tells whether the text states a minimum number of sentences
 * ("不少于N句话"), with N given as digits or as Chinese numerals.
 *
 * The pattern uses the `u` modifier so the numeral class matches whole
 * characters. Without it the class works on single bytes and also accepts
 * unrelated characters built from the same bytes.
 *
 * @param string $str Text to inspect (expected to be valid UTF-8).
 * @return bool True when a sentence-count requirement is present.
 */
function containSen($str) {
    return preg_match('/不少于(?:\d+|[一二三四五六七八九十]+)句话/u', $str) === 1;
}

/**
 * Appends one line describing a suspicious OCR result to the OCR error log.
 *
 * @param string     $errCode     Short error class, e.g. 'E1'..'E4'.
 * @param int|string $id          Question id written as ekw_id.
 * @param string     $url         URL of the image concerned.
 * @param array|null $ocrResponse Decoded OCR response; may be null when no
 *                                image produced a usable response, in which
 *                                case status and run id are logged as empty.
 * @return void
 */
function ocrErrLog($errCode, $id, $url, $ocrResponse) {
    global $logs;

    // Read defensively: the response can be null or lack either field.
    $status = $ocrResponse['data']['status'] ?? '';
    $runId = $ocrResponse['workflow_run_id'] ?? '';

    $strLog = "{$errCode}, status:{$status}, ekw_id:{$id}, url: {$url}, workflow_run_id:{$runId}\n";
    file_put_contents($logs['ocr_error'], $strLog, FILE_APPEND);
}
