AkulaBig
|
Posted: Tue Jun 03, 2025 15:26 Post subject: |
|
|
Orion9
Мы с вами обсуждали проблемы uchardet при распознавании кодировок. Сегодня на ру-борде вспомнили скрипт AkelPad для распознавания кодировок. Посмотрите, не может ли он быть нам полезным:
 AutoScript-DetectEx.js
Code: |
// http://akelpad.sourceforge.net/forum/viewtopic.php?p=27664#p27664
// Version: 1.0
// Author: Shengalts Aleksander aka Instructor
//
//
// Description(1033): Detect codepage by extension or by file content.
// Description(1049): Определить кодировку по расширению или по содержимому файла.
//
// Arguments:
// -DetectArray='[["ext1,ext2","ExpContent","ExpFlags","CodePage",BOM,DetectLang],["ext3",...]]'
//
// "ext1,ext2" -extensions divided by comma. If "", any extension is matched.
// "ExpContent" -search file content with regular expression. If "", not used.
// "ExpFlags" -regular expression flags.
// "CodePage" -default codepage specified as number, as IANA charset (e.g. "utf-8") or as regular expression backreference (e.g. "$1" or "\\1").
// BOM -byte order mark. If -1, it will be autodetected.
// DetectLang -detection language number (see LANGID_* defines). Special values:
// -1 don't change current detection language.
// -2 force to use specified "CodePage" without any detection.
// -ContentBuffer=1024 Content buffer size for regular expression search. Special values:
// 0 use codepage recognition buffer size (default).
// -1 read entire file.
//
// Usage (add to "CmdLineBegin=" manual parameter):
// /Call("Scripts::Main", 2, "AutoScript-DetectEx.js", `-DetectArray='[["cmd,bat","","","866",-1,0x0419],["xml,htm,html",".*?(encoding|charset)=([a-z\\d_\\-]+)","i","$2",-1,-2]]'`)
//Script arguments (their meaning is described in the header comment).
var pDetectArray=AkelPad.GetArgValue("DetectArray", ""),
    nContentBuffer=AkelPad.GetArgValue("ContentBuffer", 0);

//Detection language identifiers that can be given as DetectLang.
var LANGID_NONE    =0,      //None
    LANGID_RUSSIAN =0x0419, //Cyrillic (1251, OEM, KOI8-R, UTF-8)
    LANGID_POLISH  =0x0415, //Eastern European (1250, OEM, UTF-8)
    LANGID_GERMAN  =0x0407, //Western European (1252, OEM, UTF-8)
    LANGID_TURKISH =0x041F, //Turkish (1254, OEM, UTF-8)
    LANGID_CHINESE =0x0404, //Chinese (ANSI, UTF-8)
    LANGID_JAPANESE=0x0411, //Japanese (932, UTF-8)
    LANGID_KOREAN  =0x0412; //Korean (949, UTF-8)

//Handles.
var hMainWnd=AkelPad.GetMainWnd(),
    hScript;

//Values read from the open-document notification.
var lpdwFlags,
    dwFlags,
    lpFile,
    pFile,
    pExt="",
    lpnCodePage,
    lpbBOM;

//Program settings saved when a document starts opening and put back when it finishes.
var nDefaultCodepage,
    nLangCodepageRecognition;

//Settings requested by the matching detection entry.
var nNewDefaultCodepage,
    nNewBOM,
    nNewLangCodepageRecognition;

//Rule matching state.
var pContent,
    pCharset,
    oPattern,
    aDetectArray=[],
    aMatch=[],
    nOffset,
    i;
//Entry point. Calling the script while an instance with a message loop is
//already registered closes that instance; otherwise the arguments are parsed,
//the main window is subclassed and the script stays in its message loop.
hScript=AkelPad.ScriptHandle(WScript.ScriptName, 3 /*SH_FINDSCRIPT*/);

if (hScript && AkelPad.ScriptHandle(hScript, 13 /*SH_GETMESSAGELOOP*/))
{
  //An instance is already running, this second call closes it.
  AkelPad.ScriptHandle(hScript, 33 /*SH_CLOSESCRIPT*/);
}
else
{
  //NOTE(review): eval() runs the -DetectArray argument as script code. The
  //documented format relies on script literals (e.g. 0x0419), so it is kept;
  //only pass values you control.
  if (pDetectArray)
    eval("aDetectArray=" + pDetectArray + ";");

  //ContentBuffer=0 means "use the codepage recognition buffer size".
  if (!nContentBuffer)
    nContentBuffer=AkelPad.SendMessage(hMainWnd, 1222 /*AKD_GETMAININFO*/, 184 /*MI_CODEPAGERECOGNITIONBUFFER*/, 0);

  //Nothing to do without at least one detection entry.
  if (!aDetectArray.length)
  {
    AkelPad.MessageBox(hMainWnd, "Too few parameters", WScript.ScriptName, 16 /*MB_ICONERROR*/);
    WScript.Quit();
  }

  //Extension lists are compared against a lower-cased file extension.
  i=0;
  while (i < aDetectArray.length)
  {
    aDetectArray[i][0]=aDetectArray[i][0].toLowerCase();
    ++i;
  }

  if (AkelPad.WindowSubClass(1 /*WSC_MAINPROC*/, MainCallback, 0x435 /*AKDN_OPENDOCUMENT_START*/,
                             0x436 /*AKDN_OPENDOCUMENT_FINISH*/))
  {
    //Allow other scripts running and unlock main thread from waiting this script.
    AkelPad.ScriptNoMutex(0x3 /*ULT_UNLOCKSCRIPTSQUEUE|ULT_UNLOCKPROGRAMTHREAD*/);

    //Message loop
    AkelPad.WindowGetMessage();

    AkelPad.WindowUnsubClass(1 /*WSC_MAINPROC*/);
  }
}
//Main window callback for the open-document notifications.
//
//AKDN_OPENDOCUMENT_START: unless the document is being reopened, the entries of
//aDetectArray are tried in order. An entry is accepted when the file extension
//is in its extension list (or the list is empty) and its content expression
//matches (or is empty). The first accepted entry that yields a charset decides:
//  DetectLang == -2  the codepage (and optionally the BOM) is written straight
//                    into the notification data and detection flags are cleared;
//  otherwise         the default codepage (and, unless DetectLang == -1, the
//                    detection language) is switched for the time of the open.
//AKDN_OPENDOCUMENT_FINISH: settings switched at START are put back.
//
//  hWnd, wParam  not used.
//  uMsg          notification code.
//  lParam        address of the NOPENDOCUMENT data (only read at START).
//Returns nothing.
function MainCallback(hWnd, uMsg, wParam, lParam)
{
  var lpdwFlags;
  var dwFlags;
  var lpFile;
  var pFile;
  var pExt;
  var lpnCodePage;
  var lpbBOM;
  var nNewDefaultCodepage;
  var nNewBOM;
  var nNewLangCodepageRecognition;
  var pContent;
  var pCharset;
  var oPattern;
  var aMatch;
  var aRule;
  var i;

  if (uMsg == 0x436 /*AKDN_OPENDOCUMENT_FINISH*/)
  {
    //Put back whatever was switched at AKDN_OPENDOCUMENT_START.
    if (nDefaultCodepage)
      AkelPad.SendMessage(hMainWnd, 1219 /*AKD_SETMAININFO*/, 177 /*MIS_DEFAULTCODEPAGE*/, nDefaultCodepage);
    if (nLangCodepageRecognition != -1)
      AkelPad.SendMessage(hMainWnd, 1219 /*AKD_SETMAININFO*/, 183 /*MIS_LANGCODEPAGERECOGNITION*/, nLangCodepageRecognition);
    return;
  }
  if (uMsg != 0x435 /*AKDN_OPENDOCUMENT_START*/)
    return;

  //Nothing to restore yet for this document.
  nDefaultCodepage=0;
  nLangCodepageRecognition=-1;

  //The dwFlags member holds an address (it is dereferenced right below), so it
  //is read with the same pointer-sized type as the other members.
  lpdwFlags=AkelPad.MemRead(lParam + (_X64?40:20) /*offsetof(NOPENDOCUMENT, dwFlags)*/, 2 /*DT_QWORD*/);
  dwFlags=AkelPad.MemRead(lpdwFlags, 3 /*DT_DWORD*/);
  if (dwFlags & 0x100 /*OD_REOPEN*/)
    return;

  lpFile=AkelPad.MemRead(lParam + (_X64?16:8) /*offsetof(NOPENDOCUMENT, wszFile)*/, 2 /*DT_QWORD*/);
  pFile=AkelPad.MemRead(lpFile, 1 /*DT_UNICODE*/);
  pExt=AkelPad.GetFilePath(pFile, 4 /*CPF_FILEEXT*/).toLowerCase();
  pContent="";
  nNewDefaultCodepage=0;

  for (i=0; i < aDetectArray.length; ++i)
  {
    aRule=aDetectArray[i];

    //Extension filter: whole list items only, so "at" does not match "bat".
    if (aRule[0] && !IsExtensionListed(aRule[0], pExt))
      continue;

    //Start from this entry's own CodePage so nothing leaks from a previous
    //entry. CodePage may be given as a number, hence the conversion.
    pCharset=aRule[3] ? String(aRule[3]) : "";

    //Content filter
    if (aRule[1])
    {
      if (!pContent)
        pContent=AkelPad.ReadFile(pFile, 0x1C /*ADT_DETECTCODEPAGE|ADT_DETECTBOM|ADT_NOMESSAGES*/, 0, 0, nContentBuffer);
      oPattern=new RegExp(aRule[1], aRule[2]);
      aMatch=pContent.match(oPattern);
      if (!aMatch)
        continue;

      //"$N" or "\N" takes the charset from the N-th captured group.
      if (pCharset.substr(0, 1) == "$" || pCharset.substr(0, 1) == "\\")
        pCharset=aMatch[parseInt(pCharset.substr(1))];
    }

    if (pCharset)
    {
      //Numeric codepage first, charset name as fallback.
      nNewDefaultCodepage=parseInt(pCharset);
      if (isNaN(nNewDefaultCodepage))
        nNewDefaultCodepage=GetCodepageByName(pCharset.toLowerCase());
      break;
    }
  }

  //No entry was accepted, or its charset could not be resolved.
  if (i >= aDetectArray.length || !nNewDefaultCodepage)
    return;

  nNewBOM=aRule[4];
  nNewLangCodepageRecognition=aRule[5];

  if (nNewLangCodepageRecognition == -2)
  {
    //Force the codepage without any detection.
    if (nNewBOM == -1)
      dwFlags|=0x8 /*OD_ADT_DETECTBOM*/;
    else
    {
      lpbBOM=AkelPad.MemRead(lParam + (_X64?32:16) /*offsetof(NOPENDOCUMENT, bBOM)*/, 2 /*DT_QWORD*/);
      AkelPad.MemCopy(lpbBOM, nNewBOM, 3 /*DT_DWORD*/);
      dwFlags&=~0x8 /*OD_ADT_DETECTBOM*/;
    }
    lpnCodePage=AkelPad.MemRead(lParam + (_X64?24:12) /*offsetof(NOPENDOCUMENT, nCodePage)*/, 2 /*DT_QWORD*/);
    AkelPad.MemCopy(lpnCodePage, nNewDefaultCodepage, 3 /*DT_DWORD*/);
    AkelPad.MemCopy(lpdwFlags, dwFlags & ~0x6 /*OD_ADT_REGCODEPAGE|OD_ADT_DETECTCODEPAGE*/, 3 /*DT_DWORD*/);
    return;
  }

  if (nNewLangCodepageRecognition != -1)
  {
    //Remember the current detection language so FINISH can restore it.
    nLangCodepageRecognition=AkelPad.SendMessage(hMainWnd, 1222 /*AKD_GETMAININFO*/, 183 /*MI_LANGCODEPAGERECOGNITION*/, 0);
    AkelPad.SendMessage(hMainWnd, 1219 /*AKD_SETMAININFO*/, 183 /*MIS_LANGCODEPAGERECOGNITION*/, nNewLangCodepageRecognition);
  }

  //Remember the current default codepage so FINISH can restore it.
  nDefaultCodepage=AkelPad.SendMessage(hMainWnd, 1222 /*AKD_GETMAININFO*/, 177 /*MI_DEFAULTCODEPAGE*/, 0);
  AkelPad.SendMessage(hMainWnd, 1219 /*AKD_SETMAININFO*/, 177 /*MIS_DEFAULTCODEPAGE*/, nNewDefaultCodepage);
}

//Returns true when pExt equals one of the comma-separated items of pExtList.
function IsExtensionListed(pExtList, pExt)
{
  var aExts=pExtList.split(",");
  var n;

  for (n=0; n < aExts.length; ++n)
  {
    if (aExts[n] == pExt)
      return true;
  }
  return false;
}
//Translates a charset name into a codepage number.
//
//  pName  charset name; compared exactly, so it must already be lower-cased.
//Returns the codepage number, or 0 when the name is not in the table.
function GetCodepageByName(pName)
{
  switch (pName)
  {
    case "ibm037": return 37;
    case "ibm437": return 437;
    case "ibm500": return 500;
    case "asmo-708": return 708;
    case "dos-720": return 720;
    case "ibm737": return 737;
    case "ibm775": return 775;
    case "ibm850": return 850;
    case "ibm852": return 852;
    case "ibm855": return 855;
    case "ibm857": return 857;
    case "ibm00858": return 858;
    case "ibm860": return 860;
    case "ibm861": return 861;
    case "dos-862": return 862;
    case "ibm863": return 863;
    case "ibm864": return 864;
    case "ibm865": return 865;
    case "cp866": return 866;
    case "ibm869": return 869;
    case "ibm870": return 870;
    case "windows-874": return 874;
    case "cp875": return 875;
    case "shift_jis": return 932;
    case "gb2312": return 936;
    case "ks_c_5601-1987": return 949;
    case "big5": return 950;
    case "ibm1026": return 1026;
    case "ibm01047": return 1047;
    case "ibm01140": return 1140;
    case "ibm01141": return 1141;
    case "ibm01142": return 1142;
    case "ibm01143": return 1143;
    case "ibm01144": return 1144;
    case "ibm01145": return 1145;
    case "ibm01146": return 1146;
    case "ibm01147": return 1147;
    case "ibm01148": return 1148;
    case "ibm01149": return 1149;
    case "utf-16le": return 1200;
    case "utf-16be": return 1201;
    case "windows-1250": return 1250;
    case "windows-1251": return 1251;
    case "windows-1252": return 1252;
    case "windows-1253": return 1253;
    case "windows-1254": return 1254;
    case "windows-1255": return 1255;
    case "windows-1256": return 1256;
    case "windows-1257": return 1257;
    case "windows-1258": return 1258;
    case "johab": return 1361;
    case "macintosh": return 10000;
    case "x-mac-japanese": return 10001;
    case "x-mac-chinesetrad": return 10002;
    case "x-mac-korean": return 10003;
    case "x-mac-arabic": return 10004;
    case "x-mac-hebrew": return 10005;
    case "x-mac-greek": return 10006;
    case "x-mac-cyrillic": return 10007;
    case "x-mac-chinesesimp": return 10008;
    case "x-mac-romanian": return 10010;
    case "x-mac-ukrainian": return 10017;
    case "x-mac-thai": return 10021;
    case "x-mac-ce": return 10029;
    case "x-mac-icelandic": return 10079;
    case "x-mac-turkish": return 10081;
    case "x-mac-croatian": return 10082;
    case "utf-32le": return 12000;
    case "utf-32be": return 12001;
    case "x-chinese_cns": return 20000;
    case "x-cp20001": return 20001;
    case "x_chinese-eten": return 20002;
    case "x-cp20003": return 20003;
    case "x-cp20004": return 20004;
    case "x-cp20005": return 20005;
    case "x-ia5": return 20105;
    case "x-ia5-german": return 20106;
    case "x-ia5-swedish": return 20107;
    case "x-ia5-norwegian": return 20108;
    case "us-ascii": return 20127;
    case "x-cp20261": return 20261;
    case "x-cp20269": return 20269;
    case "ibm273": return 20273;
    case "ibm277": return 20277;
    case "ibm278": return 20278;
    case "ibm280": return 20280;
    case "ibm284": return 20284;
    case "ibm285": return 20285;
    case "ibm290": return 20290;
    case "ibm297": return 20297;
    case "ibm420": return 20420;
    case "ibm423": return 20423;
    case "ibm424": return 20424;
    case "x-ebcdic-koreanextended": return 20833;
    case "ibm-thai": return 20838;
    case "koi8-r": return 20866;
    case "ibm871": return 20871;
    case "ibm880": return 20880;
    case "ibm905": return 20905;
    case "ibm00924": return 20924;
    //"euc-jp" was listed a second time for 51932; a switch takes the first
    //matching label, so that entry could never be reached and was dropped.
    case "euc-jp": return 20932;
    case "x-cp20936": return 20936;
    case "x-cp20949": return 20949;
    case "cp1025": return 21025;
    case "koi8-u": return 21866;
    case "iso-8859-1": return 28591;
    case "iso-8859-2": return 28592;
    case "iso-8859-3": return 28593;
    case "iso-8859-4": return 28594;
    case "iso-8859-5": return 28595;
    case "iso-8859-6": return 28596;
    case "iso-8859-7": return 28597;
    case "iso-8859-8": return 28598;
    case "iso-8859-9": return 28599;
    case "iso-8859-13": return 28603;
    case "iso-8859-15": return 28605;
    case "x-europa": return 29001;
    case "iso-8859-8-i": return 38598;
    //"iso-2022-jp" was listed a second time for 50222; unreachable for the
    //same reason, so only the effective mapping is kept.
    case "iso-2022-jp": return 50220;
    case "csiso2022jp": return 50221;
    case "iso-2022-kr": return 50225;
    case "x-cp50227": return 50227;
    case "euc-cn": return 51936;
    case "euc-kr": return 51949;
    case "hz-gb-2312": return 52936;
    case "gb18030": return 54936;
    case "x-iscii-de": return 57002;
    case "x-iscii-be": return 57003;
    case "x-iscii-ta": return 57004;
    case "x-iscii-te": return 57005;
    case "x-iscii-as": return 57006;
    case "x-iscii-or": return 57007;
    case "x-iscii-ka": return 57008;
    case "x-iscii-ma": return 57009;
    case "x-iscii-gu": return 57010;
    case "x-iscii-pa": return 57011;
    case "utf-7": return 65000;
    case "utf-8": return 65001;
  }
  //Unknown name
  return 0;
}
|
|
|