"use strict";
/*
 * Compiler-emitted helper that runs a generator function as an async function.
 *
 * Reuses `this.__awaiter` when one is already present on `this`; otherwise
 * defines the helper below.
 *
 * thisArg    - `this` value the generator function is applied with.
 * _arguments - argument list for the generator function (defaults to []).
 * P          - promise constructor to use; falls back to the global Promise.
 * generator  - generator function to drive; it is replaced by the generator
 *              object it returns once applied.
 *
 * Returns a P instance that resolves with the generator's final value, or
 * rejects with anything thrown while stepping it.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in a P unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the settled value; a synchronous throw rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Re-throw a rejection inside the generator so its own try/catch can observe it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Finished: resolve with the return value. Otherwise wait on the yielded value and continue.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/*
 * Compiler-emitted helper that emulates a generator object on top of a plain
 * function `body` written as a `switch (state.label)` state machine.
 *
 * Reuses `this.__generator` when one is already present on `this`.
 *
 * thisArg - `this` value `body` is called with.
 * body    - function receiving the shared state object `_` and returning the
 *           next instruction as an array `[opcode, operand]`.
 *
 * Returns an object exposing `next`, `throw` and `return` (and, when Symbol is
 * available, `Symbol.iterator` returning itself).
 *
 * Opcodes as handled by `step` below:
 *   0 next / 1 throw / 2 return  - issued by the public methods via `verb`
 *   3 break (jump to label op[1])
 *   4 yield op[1]
 *   5 hand control to the iterator in op[1] (it is then driven through `y`)
 *   6 error raised while running `body` (produced by the catch in `step`)
 *   7 end of a finally region
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
    // _: state shared with `body` (label = current case, sent() = last value received or
    //    rethrown error, trys = active protected regions, ops = deferred instructions).
    // f: re-entrancy flag, y: delegate iterator, t: scratch value, g: the generator object.
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
    return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    // Build the public method for opcode n.
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        // Loop until `body` yields or the state object is cleared (`_ = 0` marks completion).
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // While a delegate iterator is active, forward next/throw/return to it first.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    // Route break/throw/return through the innermost entry of `_.trys`, if any.
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        // Opcodes 1 and 6 (any opcode with bit 1 or 4 set) surface as a throw; everything else completes.
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
/*
 * Compiler-emitted helper backing `for...of`: returns an iterator over `o`.
 *
 * Uses the object's own Symbol.iterator method when it has one. Otherwise,
 * any truthy value with a numeric `length` is walked by index. Anything else
 * raises a TypeError.
 */
var __values = (this && this.__values) || function (o) {
    var iteratorKey = typeof Symbol === "function" && Symbol.iterator;
    var iteratorMethod = iteratorKey && o[iteratorKey];
    if (iteratorMethod) {
        return iteratorMethod.call(o);
    }

    var isArrayLike = o && typeof o.length === "number";
    if (!isArrayLike) {
        throw new TypeError(iteratorKey ? "Object is not iterable." : "Symbol.iterator is not defined.");
    }

    var cursor = 0;
    return {
        next: function () {
            // Drop the reference once exhausted so every later call reports done.
            if (o && cursor >= o.length) {
                o = void 0;
            }
            return { value: o && o[cursor++], done: !o };
        }
    };
};
/*
 * Compiler-emitted helper backing array destructuring: collects up to `n`
 * values from the iterable `o` into an array (all values when `n` is omitted).
 *
 * A value without a Symbol.iterator method is returned unchanged. If reading
 * stops before the iterator reports done, its `return` method (when present)
 * is called. An error raised while iterating is rethrown after that cleanup.
 */
var __read = (this && this.__read) || function (o, n) {
    var getIterator = typeof Symbol === "function" && o[Symbol.iterator];
    if (!getIterator) {
        return o;
    }

    var iterator = getIterator.call(o);
    var collected = [];
    var lastStep;
    var failure;
    try {
        for (;;) {
            // The budget is only counted down when a limit was supplied.
            if (n !== void 0 && !(n-- > 0)) {
                break;
            }
            lastStep = iterator.next();
            if (lastStep.done) {
                break;
            }
            collected.push(lastStep.value);
        }
    }
    catch (error) {
        failure = { error: error };
    }
    finally {
        try {
            var closer;
            if (lastStep && !lastStep.done && (closer = iterator["return"])) {
                closer.call(iterator);
            }
        }
        finally {
            if (failure) {
                throw failure.error;
            }
        }
    }
    return collected;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExtractText = void 0;
var neverthrow_1 = require("neverthrow");
var client_textract_1 = require("@aws-sdk/client-textract");
var decode_1 = require("../../util/decode");
var face_library_1 = require("../../util/face_library");
var s3_1 = require("../../util/s3");
/**
 * Scans the raw bytes of a PDF for "/V " entries and returns the text of each
 * entry, one per line.
 *
 * Each entry runs from its "/V " marker up to (not including) the next ">>".
 * When no ">>" follows a marker the entry runs to the end of the buffer.
 * Previously the -1 from indexOf made substring() swap its arguments and
 * return everything *before* the marker instead.
 *
 * NOTE(review): presumably "/V " targets form-field values inside PDF
 * dictionaries; confirm against the documents this is meant to handle.
 *
 * @param {Buffer} buffer - PDF contents; decoded as UTF-8 for scanning.
 * @returns {string} The matched entries joined with "\n" ("" when none).
 */
var extractTextFromPdf = function (buffer) {
    console.warn("Trying to extract text from PDF");
    var bufferStr = buffer.toString("utf8");
    var delim = ">>"; // Use >> as the closing tag
    var locations = [];
    var i = -1;
    while ((i = bufferStr.indexOf('/V ', i + 1)) >= 0) {
        locations.push(i);
    }
    console.log("Locations:", locations);
    return locations.map(function (start) {
        var end = bufferStr.indexOf(delim, start);
        // An unterminated entry extends to the end of the data.
        return end === -1 ? bufferStr.substring(start) : bufferStr.substring(start, end);
    }).join('\n');
};
exports.ExtractText = {
    getSlugs: function (params) {
        var slugs = ['_text'];
        var metadata = JSON.parse(params.infoDefs[params.targetField].metadata || '{}');
        if (metadata && metadata.encrypt_sensitive_info) {
            for (var key in metadata.encrypt_sensitive_info) {
                slugs.push("_".concat(key));
            }
        }
        return (0, neverthrow_1.ok)({ 'slugs': ['_text', '_text_error'] });
    },
    compute: function (params) {
        var _a;
        return __awaiter(this, void 0, void 0, function () {
            var defaultReturn, key, value, uid, docs, text, s3, docs_1, docs_1_1, doc, loc, docLower, matches, bucket, path, file, error, converted, _b, textract, cmd, response, words, e_1, e_2_1, metadata, token, sensitiveInfoKey, func, dataInText, normalizedData, encrypted;
            var e_2, _c, _d, _e, _f;
            return __generator(this, function (_g) {
                switch (_g.label) {
                    case 0:
                        defaultReturn = {};
                        if (params.targetField.endsWith('_text') && params.newInfoKeys[params.targetField]) {
                            // only compute once.
                            console.log("Already extracted");
                            //return ok({});
                        }
                        if (!params.conn) {
                            console.warn("Cannot extract text without a connection.");
                            // This is for optimistic compute, so don't return an error.
                            // Just don't compute this field. (It will be computed when called via the API)
                            return [2 /*return*/, (0, neverthrow_1.ok)(defaultReturn)];
                        }
                        key = params.targetField.replace(/\_text$/, '');
                        value = params.newInfoKeys[key];
                        uid = params.newInfoKeys['uid'];
                        docs = (',' + (value || '')).split(",https:").filter(function (url) { return url && url.startsWith('//'); }).map(function (url) { return "https:".concat(url); });
                        console.debug("Extracting text for field ".concat(params.targetField), value);
                        text = '';
                        s3 = (0, s3_1.getS3)();
                        _g.label = 1;
                    case 1:
                        _g.trys.push([1, 13, 14, 15]);
                        docs_1 = __values(docs), docs_1_1 = docs_1.next();
                        _g.label = 2;
                    case 2:
                        if (!!docs_1_1.done) return [3 /*break*/, 12];
                        doc = docs_1_1.value;
                        loc = doc;
                        docLower = doc.toLowerCase();
                        if (docLower.indexOf('twilio') !== -1) {
                            console.log("Skipping Twilio");
                            return [3 /*break*/, 11];
                        }
                        if (((_a = params.computeOptions) === null || _a === void 0 ? void 0 : _a.skipHeics) && (docLower.endsWith('heic') || docLower.endsWith('pdf'))) {
                            console.log("Skipping HEIC/PDF for text extraction");
                            return [3 /*break*/, 11];
                        }
                        matches = doc.match(/^https:\/\/(.*)\.s3\.(?:us-east-2\.)?amazonaws\.com\/(.*)$/);
                        bucket = matches && matches[1] || '';
                        console.log(matches, bucket);
                        if (!(bucket === 'aidkit-documents' || bucket === 'workersfund-documents')) return [3 /*break*/, 10];
                        path = (0, decode_1.safeDecodeURIComponent)(matches && matches[2] || '');
                        file = path.split('/').pop();
                        error = "";
                        if (!(docLower.includes('.heic') || docLower.includes('.pdf'))) return [3 /*break*/, 5];
                        console.log("Converting heic/pdf for text extraction..", docLower, bucket);
                        _b = face_library_1.convertToJpg;
                        _d = {
                            loc: {
                                Client: s3,
                                Bucket: bucket,
                                Key: matches[2]
                            }
                        };
                        return [4 /*yield*/, params.aidkitCrypto];
                    case 3: return [4 /*yield*/, _b.apply(void 0, [(_d.token = (_g.sent()),
                                _d)])];
                    case 4:
                        converted = _g.sent();
                        if (converted.isOk()) {
                            path = converted.value;
                        }
                        else {
                            error = converted.error;
                        }
                        _g.label = 5;
                    case 5:
                        if (!path || error) {
                            return [2 /*return*/, (0, neverthrow_1.ok)((_e = {}, _e[key + "_text_error"] = error || "path_not_defined", _e))];
                        }
                        console.log("Extracting text from:", path);
                        textract = new client_textract_1.TextractClient({ region: 'us-east-2' });
                        cmd = new client_textract_1.DetectDocumentTextCommand({
                            Document: {
                                S3Object: {
                                    Bucket: bucket,
                                    Name: path
                                }
                            },
                        });
                        _g.label = 6;
                    case 6:
                        _g.trys.push([6, 8, , 9]);
                        return [4 /*yield*/, textract.send(cmd)];
                    case 7:
                        response = _g.sent();
                        // console.log("Textract blocks:", response.Blocks);
                        if (response.Blocks && response.Blocks.length > 0) {
                            words = (response.Blocks || []).filter(function (b) { return b.BlockType === 'WORD'; }).map(function (b) { return b.Text; }).join(' ');
                            text += words + '\n';
                        }
                        return [3 /*break*/, 9];
                    case 8:
                        e_1 = _g.sent();
                        console.log("Error extracting text:", e_1);
                        if (e_1 instanceof client_textract_1.UnsupportedDocumentException) {
                            console.log("Unsupported document type: ", file, params.newInfoKeys['uid'], key);
                            text += "\n" + file + ': { "error": "not a supported document type." }\n';
                        }
                        return [3 /*break*/, 9];
                    case 9: return [3 /*break*/, 11];
                    case 10:
                        console.log("Skipping weird location");
                        return [3 /*break*/, 11];
                    case 11:
                        docs_1_1 = docs_1.next();
                        return [3 /*break*/, 2];
                    case 12: return [3 /*break*/, 15];
                    case 13:
                        e_2_1 = _g.sent();
                        e_2 = { error: e_2_1 };
                        return [3 /*break*/, 15];
                    case 14:
                        try {
                            if (docs_1_1 && !docs_1_1.done && (_c = docs_1.return)) _c.call(docs_1);
                        }
                        finally { if (e_2) throw e_2.error; }
                        return [7 /*endfinally*/];
                    case 15:
                        metadata = JSON.parse(params.infoDefs[key].metadata || '{}');
                        if (!(metadata && metadata.encrypt_sensitive_info)) return [3 /*break*/, 17];
                        return [4 /*yield*/, params.aidkitCrypto];
                    case 16:
                        token = _g.sent();
                        if (!token) {
                            console.warn("Could not get token for encrypting sensitive info from document.");
                            return [2 /*return*/, (0, neverthrow_1.err)("cannot_encrypt_without_key")];
                        }
                        for (sensitiveInfoKey in metadata.encrypt_sensitive_info) {
                            func = void 0;
                            try {
                                func = new Function('return (function(text) { const out = ' + metadata.encrypt_sensitive_info[sensitiveInfoKey] + '; return out; })')();
                            }
                            catch (e) {
                                console.log("Error creating function:", e);
                                return [2 /*return*/, (0, neverthrow_1.err)("bad_sensitive_info_formula_1")];
                            }
                            dataInText = void 0, normalizedData = void 0;
                            try {
                                _f = __read(func(text), 2), dataInText = _f[0], normalizedData = _f[1];
                            }
                            catch (e) {
                                console.log("Error running sensitive info function:", e);
                                return [2 /*return*/, (0, neverthrow_1.err)("bad_sensitive_info_formula_2")];
                            }
                            encrypted = token.encode(normalizedData);
                            while (text.indexOf(dataInText) !== -1) {
                                text = text.replace(dataInText, encrypted);
                            }
                            defaultReturn[key + "_" + sensitiveInfoKey] = encrypted;
                        }
                        _g.label = 17;
                    case 17:
                        // Replace all non ASCII chars.
                        text = cleanString(text);
                        defaultReturn[key + "_text"] = text;
                        return [2 /*return*/, (0, neverthrow_1.ok)(defaultReturn)];
                }
            });
        });
    }
};
/**
 * Returns `input` with everything except printable ASCII removed.
 *
 * Only UTF-16 code units in the range 0x20-0x7E are kept, so non-ASCII
 * characters and all control characters (including tab, newline and DEL)
 * are dropped.
 *
 * @param {string} input - Text to sanitize.
 * @returns {string} The retained characters, in their original order.
 */
function cleanString(input) {
    var kept = [];
    for (var idx = 0; idx < input.length; idx += 1) {
        var code = input.charCodeAt(idx);
        if (code >= 0x20 && code <= 0x7E) {
            kept.push(input.charAt(idx));
        }
    }
    return kept.join("");
}
