kcharsets.cpp
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
#include "kcharsets.h"
00020
00021
#include "kentities.c"
00022
00023
#include <kapplication.h>
00024
#include <kglobal.h>
00025
#include <klocale.h>
00026
#include <kconfig.h>
00027
00028
#include <qfontinfo.h>
00029
#include <qstrlist.h>
00030
#include <qfontdatabase.h>
00031
#include <kdebug.h>
00032
00033
#include <qtextcodec.h>
00034
#include <qmap.h>
00035
#include <qcstring.h>
00036
00037
#include <assert.h>
00038
00039
#define CHARSETS_COUNT 33
00040
00041
static const char *
const language_names[] = {
00042 I18N_NOOP(
"Other" ),
00043 I18N_NOOP(
"Arabic" ),
00044 I18N_NOOP(
"Baltic" ),
00045 I18N_NOOP(
"Central European" ),
00046 I18N_NOOP(
"Chinese Simplified" ),
00047 I18N_NOOP(
"Chinese Traditional" ),
00048 I18N_NOOP(
"Cyrillic" ),
00049 I18N_NOOP(
"Greek" ),
00050 I18N_NOOP(
"Hebrew" ),
00051 I18N_NOOP(
"Japanese" ),
00052 I18N_NOOP(
"Korean" ),
00053 I18N_NOOP(
"Thai" ),
00054 I18N_NOOP(
"Turkish" ),
00055 I18N_NOOP(
"Western European" ),
00056 I18N_NOOP(
"Tamil" ),
00057 I18N_NOOP(
"Unicode" ),
00058 I18N_NOOP(
"Northern Saami" )
00059 };
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
// Flat table of null-terminated groups; the table itself ends with one
// additional null entry.
//
// Layout of one group:
//   "<encoding name>", "<charset>", "<charset>", ..., 0
//
// availableEncodingNames() reports the first string of every group that has
// at least one charset after it. The remaining strings are presumably font
// charset names in order of preference (NOTE(review): nothing visible in this
// file consumes them beyond checking that the list is non-empty).
//
// NOTE(review): a few rows spell charsets as "iso-8859-N" or "cp1252" while
// the rest use "iso8859-N" / "cp 1252"; left as-is pending confirmation of
// the intended spelling.
static const char* const charsets_for_encoding[] = {
    "koi8-r",          "koi8-r", "cp 1251", "koi8-u", "iso-8859-5", 0,
    "koi8-u",          "koi8-u", "cp 1251", "iso-8859-5", "koi8-r", 0,
    "iso 8859-1",      "iso8859-1", "iso8859-15", 0,
    "iso 8859-2",      "iso8859-2", "unicode", "iso8859-1", 0,
    "iso 8859-3",      "iso8859-3", "unicode", "iso8859-1", 0,
    "iso 8859-4",      "iso8859-4", "unicode", "iso8859-13", "iso8859-1", 0,
    "iso 8859-5",      "iso8859-5", "koi8-u", "koi8-r", 0,
    "iso 8859-6",      "unicode", "iso8859-6", 0,
    "iso 8859-7",      "iso8859-7", 0,
    "iso 8859-8",      "iso8859-8", 0,
    "iso 8859-8-i",    "iso8859-8", 0,
    "iso 8859-9",      "iso8859-9", "unicode", "iso8859-1", 0,
    "iso 8859-11",     "iso8859-11", 0,
    "iso 8859-13",     "iso8859-13", "unicode", "iso8859-4", "iso8859-1", 0,
    "iso 8859-15",     "iso8859-15", "unicode", "iso8859-1", 0,
    "utf8",            "unicode", "iso8859-1", 0,
    "utf16",           "unicode", "iso8859-1", 0,
    "iso-10646-ucs-2", "unicode", "iso8859-1", 0,
    "cp 1250",         "iso8859-2", 0,
    "cp 1251",         "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "cp 1252",         "iso8859-1", 0,
    "cp 1253",         "iso8859-7", 0,
    "cp 1254",         "iso8859-9", 0,
    "cp 1255",         "iso8859-8", 0,
    "cp 1256",         "unicode", "iso8859-6", 0,
    "cp 1257",         "iso8859-13", "iso8859-4", 0,
    "ibm850",          "ibm850", "unicode", "iso8859-1", 0,
    "ibm852",          "unicode", "iso-8859-2", 0,
    "ibm866",          "ibm866", "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "tis620",          "iso8859-11", 0,
    "eucjp",           "eucjp", "unicode", "iso8859-1", 0,
    "sjis",            "eucjp", "unicode", "iso8859-1", 0,
    "jis7",            "eucjp", "unicode", "iso8859-1", 0,
    "big5",            "big5", "unicode", "iso8859-1", 0,
    "gbk",             "gb2312.1980-0", "gbk-0", "unicode", "iso8859-1", 0,
    // Fixed: this row previously listed the non-existent "gb2313.1980-0".
    "gb18030",         "gb18030.2000-1", "gb18030.2000-0", "unicode", "gbk-0", "gb2312.1980-0", "iso8859-1", 0,
    "gb2312",          "gb2312.1980-0", "unicode", "iso8859-1", 0,
    "euckr",           "euckr", "unicode", "iso8859-1", 0,
    "tscii",           "tscii", 0,
    "pt 154",          "pt 154", "cp 1251", "koi8-u", "koi8-r", "iso8859-5", 0,
    "winsami2",        "winsami2", "cp1252", "unicode", 0,
    0 // end of table
};
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
// One row of the encoding -> language group table.
struct LanguageForEncoding
{
    const char* index;  // encoding name (lookup key); 0 marks the end of the table
    int data;           // position of the group name in language_names[]
};

// Maps an encoding name to the index of its language group in
// language_names[]. Searched linearly by kcharsets_array_search(), which
// yields 0 ("Other") for names that are not listed.
//
// "tscii" was previously absent, so it resolved to "Other" and the "Tamil"
// group (14) was never used; it is now mapped explicitly.
static const LanguageForEncoding language_for_encoding[] = {
    // 13: Western European
    { "iso 8859-1", 13 },
    { "iso 8859-15", 13 },
    { "cp 1252", 13 },
    { "ibm850", 13 },
    // 3: Central European / 2: Baltic / 12: Turkish
    { "iso 8859-2", 3 },
    { "iso 8859-3", 3 },
    { "iso 8859-4", 2 },
    { "iso 8859-13", 2 },
    { "cp 1250", 3 },
    { "cp 1254", 12 },
    { "cp 1257", 2 },
    { "ibm852", 3 },
    // 6: Cyrillic
    { "koi8-r", 6 },
    { "iso 8859-5", 6 },
    { "cp 1251", 6 },
    { "koi8-u", 6 },
    { "pt 154", 6 },
    { "ibm866", 6 },
    // 5: Chinese Traditional / 4: Chinese Simplified
    { "big5", 5 },
    { "gb18030", 4 },
    { "gbk", 4 },
    { "gb2312", 4 },
    // 10: Korean / 9: Japanese
    { "euckr", 10 },
    { "sjis", 9 },
    { "jis7", 9 },
    { "eucjp", 9 },
    // 7: Greek
    { "iso 8859-7", 7 },
    { "cp 1253", 7 },
    // 1: Arabic
    { "iso 8859-6", 1 },
    { "cp 1256", 1 },
    // 8: Hebrew
    { "iso 8859-8", 8 },
    { "iso 8859-8-i", 8 },
    { "cp 1255", 8 },
    // 12: Turkish
    { "iso 8859-9", 12 },
    // 11: Thai
    { "tis620", 11 },
    { "iso 8859-11", 11 },
    // 14: Tamil
    { "tscii", 14 },
    // 15: Unicode
    { "utf8", 15 },
    { "utf16", 15 },
    { "utf7", 15 },
    { "ucs2", 15 },
    { "iso-10646-ucs-2", 15 },
    // 16: Northern Saami
    { "winsami2", 16 },
    { 0, 0 }  // end of table
};
00179
00180
00181
// One row of the built-in name translation table.
struct Builtin
{
    const char* index;  // name as requested by the caller; 0 marks the end of the table
    const char* data;   // name handed to QTextCodec::codecForName() instead
};

// Alternative spellings of encoding names. codecForName() consults this table
// (via kcharsets_array_search) after a direct QTextCodec::codecForName()
// lookup of the requested name has failed, and retries with the `data` name.
static const Builtin builtin[] = {
    // KOI8
    { "iso-ir-111", "koi8-r" },
    { "koi8-ru", "koi8-u" },
    { "koi8r", "koi8-r" },
    { "koi8u", "koi8-u" },
    { "koi unified", "koi8-r" },
    // ASCII
    { "us-ascii", "iso 8859-1" },
    { "usascii", "iso 8859-1" },
    // Unicode
    { "x-utf-8", "utf-8" },
    { "x-utf-7", "utf-7" },
    { "unicode-1-1-utf-7", "utf-7" },
    { "utf-16", "iso-10646-ucs-2" },
    { "utf16", "iso-10646-ucs-2" },
    { "ucs2", "iso-10646-ucs-2" },
    { "iso10646-1", "iso-10646-ucs-2" },
    // Chinese
    { "gb18030.2000-1", "gb18030" },
    { "gb18030.2000-0", "gb18030" },
    { "gbk-0", "gbk" },
    { "gb2312", "gbk" },
    { "gb2312.1980-0", "gbk" },
    { "big5-0", "big5" },
    // Korean
    { "euc-kr", "euckr" },
    { "x-euc-kr", "euckr" },
    // Japanese
    { "euc-jp", "eucjp" },
    { "x-euc-jp", "eucjp" },
    { "jisx0201.1976-0", "eucjp" },
    { "jisx0208.1983-0", "eucjp" },
    { "jisx0208.1990-0", "eucjp" },
    { "jisx0208.1997-0", "eucjp" },
    { "jisx0212.1990-0", "eucjp" },
    { "jisx0213.2000-1", "eucjp" },
    { "jisx0213.2000-2", "eucjp" },
    { "shift_jis", "sjis" },
    { "shift-jis", "sjis" },
    { "x-sjis", "sjis" },
    { "iso-2022-jp", "jis7" },
    // "windowsNNN"
    { "windows850", "ibm850" },
    { "windows866", "ibm866" },
    { "windows1251", "cp 1251" },
    { "windows1252", "cp 1252" },
    { "windows1253", "cp 1253" },
    { "windows1254", "cp 1254" },
    { "windows1255", "cp 1255" },
    { "windows1256", "cp 1256" },
    { "windows1257", "cp 1257" },
    // "windows-NNN"
    { "windows-850", "ibm850" },
    { "windows-866", "ibm866" },
    { "windows-1250", "cp 1250" },
    { "windows-1251", "cp 1251" },
    { "windows-1252", "cp 1252" },
    { "windows-1253", "cp 1253" },
    { "windows-1254", "cp 1254" },
    { "windows-1255", "cp 1255" },
    { "windows-1256", "cp 1256" },
    { "windows-1257", "cp 1257" },
    // "x-windows-NNN"
    { "x-windows-850", "ibm850" },
    { "x-windows-866", "ibm866" },
    { "x-windows-1250", "cp 1250" },
    { "x-windows-1251", "cp 1251" },
    { "x-windows-1252", "cp 1252" },
    { "x-windows-1253", "cp 1253" },
    { "x-windows-1254", "cp 1254" },
    { "x-windows-1255", "cp 1255" },
    { "x-windows-1256", "cp 1256" },
    { "x-windows-1257", "cp 1257" },
    // "cpNNN"
    { "cp850", "ibm850" },
    { "cp866", "ibm866" },
    // "cp-NNN"
    { "cp-850", "ibm850" },
    { "cp-866", "ibm866" },
    { "cp-1250", "cp 1250" },
    { "cp-1251", "cp 1251" },
    { "cp-1252", "cp 1252" },
    { "cp-1253", "cp 1253" },
    { "cp-1254", "cp 1254" },
    { "cp-1255", "cp 1255" },
    { "cp-1256", "cp 1256" },
    { "cp-1257", "cp 1257" },
    { "cp-10000", "apple roman" },
    // "x-cp-NNN"
    { "x-cp-850", "ibm850" },
    { "x-cp-866", "ibm866" },
    { "x-cp-1250", "cp 1250" },
    { "x-cp-1251", "cp 1251" },
    { "x-cp-1252", "cp 1252" },
    { "x-cp-1253", "cp 1253" },
    { "x-cp-1254", "cp 1254" },
    { "x-cp-1255", "cp 1255" },
    { "x-cp-1256", "cp 1256" },
    { "x-cp-1257", "cp 1257" },
    { "x-cp-10000", "apple roman" },
    // Thai
    { "tis620", "iso 8859-11" },
    { "tis-620", "iso 8859-11" },
    { "thai-tis620", "iso 8859-11" },
    { "windows-874", "iso 8859-11" },
    { "windows874", "iso 8859-11" },
    { "x-windows-874", "iso 8859-11" },
    { "cp874", "iso 8859-11" },
    { "cp-874", "iso 8859-11" },
    { "x-cp-874", "iso 8859-11" },
    // Korean (KS C 5601)
    { "ksc5601.1987-0", "euckr" },
    { "ks_c_5601-1987", "euckr" },
    // "iso-8859-N" -> "iso 8859-N"
    { "iso-8859-1", "iso 8859-1" },
    { "iso-8859-2", "iso 8859-2" },
    { "iso-8859-3", "iso 8859-3" },
    { "iso-8859-4", "iso 8859-4" },
    { "iso-8859-5", "iso 8859-5" },
    { "iso-8859-6", "iso 8859-6" },
    { "iso-8859-7", "iso 8859-7" },
    { "iso-8859-8", "iso 8859-8" },
    { "iso-8859-9", "iso 8859-9" },
    { "iso-8859-10", "iso 8859-10" },
    { "iso-8859-11", "iso 8859-11" },
    { "iso-8859-12", "iso 8859-12" },
    { "iso-8859-13", "iso 8859-13" },
    { "iso-8859-14", "iso 8859-14" },
    { "iso-8859-15", "iso 8859-15" },
    // Miscellaneous
    { "tscii", "tscii" },
    { "paratype-154", "pt 154" },
    { "pt-154", "pt 154" },
    { "x-winsami2", "winsami2" },
    { "x-mac-roman", "apple roman" },
    { "macintosh", "apple roman" },
    { "mac", "apple roman" },
    { 0, 0 }  // end of table
};
00308
00309
00310
00311
// One row of the charmap-file alias table.
struct Aliases
{
    const char* index;  // requested name; 0 marks the end of the table
    const char* data;   // name used to build the charmap file name
};

// Consulted by codecForName() before it tries QTextCodec::loadCharmapFile():
// a matching name is replaced by its `data` value, which is then upper-cased
// and appended to the charmap directory.
static const Aliases aliases[] = {
    { "cp852", "ibm852" },
    { "cp-852", "ibm852" },
    { "x-cp-852", "ibm852" },
    { "windows852", "ibm852" },
    { "windows-852", "ibm852" },
    { "x-windows-852", "ibm852" },
    { 0, 0 }  // end of table
};
00323
00324
00325
00326
00327
// One row of the last-resort substitution table.
struct ConversionHints
{
    const char* index;  // lower-cased name that could not be resolved; 0 ends the table
    const char* data;   // codec name to try in its place
};

// Final fallback used by codecForName(): when neither a codec nor a charmap
// file was found for a name, the `data` codec is requested from QTextCodec
// instead.
static const ConversionHints conversion_hints[] = {
    { "cp1250", "iso-8859-2" },
    { "koi8-r", "iso-8859-5" },
    { "koi8-u", "koi8-r" },
    { 0, 0 }  // end of table
};
00336
00337
00338
00339
00340
// Linear search over one of the static lookup tables above.
//
// T must provide an `index` member (a C string, 0 in the terminating row) and
// a `data` member convertible to Data. The table is scanned from `start` up
// to the terminating row; the `data` of the first row whose `index` compares
// equal to `entry` (qstrcmp) is returned. If no row matches, 0 is returned.
template< typename T, typename Data >
static Data kcharsets_array_search( const T* start, const char* entry )
{
    const T* row = start;
    while( row->index != 0 ) {
        if( qstrcmp( row->index, entry ) == 0 )
            return row->data;
        ++row;
    }
    return 0;
}
00350
00351
00352
class KCharsetsPrivate
00353 {
00354
public:
00355 KCharsetsPrivate(
KCharsets* _kc)
00356 : codecForNameDict(43, false)
00357 {
00358 db = 0;
00359 kc = _kc;
00360 }
00361 ~KCharsetsPrivate()
00362 {
00363
delete db;
00364 }
00365
QFontDatabase *db;
00366
QAsciiDict<QTextCodec> codecForNameDict;
00367
KCharsets* kc;
00368 };
00369
00370
00371
00372 KCharsets::KCharsets()
00373 {
00374 d =
new KCharsetsPrivate(
this);
00375 }
00376
00377 KCharsets::~KCharsets()
00378 {
00379
delete d;
00380 }
00381
00382 QChar KCharsets::fromEntity(
const QString &str)
00383 {
00384
QChar res = QChar::null;
00385
00386
int pos = 0;
00387
if(str[pos] ==
'&') pos++;
00388
00389
00390
if (str[pos] ==
'#' && str.
length()-pos > 1) {
00391
bool ok;
00392 pos++;
00393
if (str[pos] ==
'x' || str[pos] ==
'X') {
00394 pos++;
00395
00396
QString tmp(str.
unicode()+pos, str.
length()-pos);
00397 res = tmp.
toInt(&ok, 16);
00398 }
else {
00399
00400
QString tmp(str.
unicode()+pos, str.
length()-pos);
00401 res = tmp.
toInt(&ok, 10);
00402 }
00403
return res;
00404 }
00405
00406
const entity *e = kde_findEntity(str.
ascii(), str.
length());
00407
00408
if(!e)
00409 {
00410
00411
return QChar::null;
00412 }
00413
00414
00415
return QChar(e->code);
00416 }
00417
00418 QChar KCharsets::fromEntity(
const QString &str,
int &len)
00419 {
00420
00421
00422 len = 8;
00423
while(len > 0)
00424 {
00425
QString tmp = str.
left(len);
00426
QChar res =
fromEntity(tmp);
00427
if( res != QChar::null )
return res;
00428 len--;
00429 }
00430
return QChar::null;
00431 }
00432
00433
00434 QString KCharsets::toEntity(
const QChar &ch)
00435 {
00436
QString ent;
00437 ent.
sprintf(
"�x%x;", ch.
unicode());
00438
return ent;
00439 }
00440
00441 QString KCharsets::resolveEntities(
const QString &input )
00442 {
00443
QString text = input;
00444
const QChar *p = text.
unicode();
00445
const QChar *end = p + text.
length();
00446
const QChar *ampersand = 0;
00447
bool scanForSemicolon =
false;
00448
00449
for ( ; p < end; ++p ) {
00450
const QChar ch = *p;
00451
00452
if ( ch ==
'&' ) {
00453 ampersand = p;
00454 scanForSemicolon =
true;
00455
continue;
00456 }
00457
00458
if ( ch !=
';' || scanForSemicolon ==
false )
00459
continue;
00460
00461 assert( ampersand );
00462
00463 scanForSemicolon =
false;
00464
00465
const QChar *entityBegin = ampersand + 1;
00466
00467
const uint entityLength = p - entityBegin;
00468
if ( entityLength == 0 )
00469
continue;
00470
00471
const QChar entityValue =
KCharsets::fromEntity(
QConstString( entityBegin, entityLength ).string() );
00472
if ( entityValue.
isNull() )
00473
continue;
00474
00475
const uint ampersandPos = ampersand - text.
unicode();
00476
00477 text[ (
int)ampersandPos ] = entityValue;
00478 text.
remove( ampersandPos + 1, entityLength + 1 );
00479 p = text.
unicode() + ampersandPos;
00480 end = text.
unicode() + text.
length();
00481 ampersand = 0;
00482 }
00483
00484
return text;
00485 }
00486
00487 QStringList KCharsets::availableEncodingNames()
00488 {
00489
QStringList available;
00490
00491
const char*
const* pos = charsets_for_encoding;
00492
while( *pos != 0 ) {
00493
00494
00495
00496
00497
for(
const char*
const* charsets = pos + 1;
00498 *charsets != 0;
00499 ++charsets ) {
00500
00501
#ifdef __GNUC__
00502
#warning FIXME?
00503
#endif
00504
if(
true ) {
00505
00506 available.append( QString::fromLatin1( *pos ));
00507
break;
00508 }
00509 }
00510
while( *pos != 0 )
00511 ++pos;
00512 ++pos;
00513 }
00514
return available;
00515 }
00516
00517 QString KCharsets::languageForEncoding(
const QString &encoding )
00518 {
00519
int lang = kcharsets_array_search< LanguageForEncoding, int >
00520 ( language_for_encoding, encoding.
latin1());
00521
return i18n( language_names[lang] );
00522 }
00523
00524 QString KCharsets::encodingForName(
const QString &descriptiveName )
00525 {
00526
const int left = descriptiveName.
findRev(
'(' );
00527
00528
if (left<0)
00529
return descriptiveName.
stripWhiteSpace();
00530
00531
QString name(descriptiveName.
mid(left+1));
00532
00533
const int right = name.findRev(
')' );
00534
00535
if (right<0)
00536
return name;
00537
00538
return name.left(right).stripWhiteSpace();
00539 }
00540
00541 QStringList KCharsets::descriptiveEncodingNames()
00542 {
00543
QStringList encodings =
availableEncodingNames();
00544 QStringList::Iterator it;
00545
for( it = encodings.begin(); it != encodings.end(); ++it ) {
00546
QString lang =
KGlobal::charsets()->
languageForEncoding( *it );
00547 *it = i18n(
"Descriptive Encoding Name",
"%1 ( %2 )") .
arg(lang) .arg(*it);
00548 }
00549 encodings.
sort();
00550
return encodings;
00551 }
00552
00553 QTextCodec *
KCharsets::codecForName(
const QString &n)
const
00554
{
00555
bool b;
00556
return codecForName( n, b );
00557 }
00558
00559 QTextCodec *
KCharsets::codecForName(
const QString &n,
bool &ok)
const
00560
{
00561 ok =
true;
00562
00563
QTextCodec* codec = 0;
00564
00565
if((codec = d->codecForNameDict[n.
isEmpty() ?
"->locale<-" : n.
latin1()]))
00566
return codec;
00567
00568
if (n.
isEmpty()) {
00569 codec =
KGlobal::locale()->
codecForEncoding();
00570 d->codecForNameDict.replace(
"->locale<-", codec);
00571
return codec;
00572 }
00573
00574
QCString name = n.
lower().latin1();
00575
QCString key = name;
00576
if (name.right(8) ==
"_charset")
00577 name.truncate(name.length()-8);
00578
00579
if (name.isEmpty()) {
00580 ok =
false;
00581
return QTextCodec::codecForName(
"iso8859-1");
00582 }
00583
00584 codec = QTextCodec::codecForName(name);
00585
00586
if(codec) {
00587 d->codecForNameDict.replace(key, codec);
00588
return codec;
00589 }
00590
00591
00592
00593
QCString cname = kcharsets_array_search< Builtin, const char* >( builtin, name.data());
00594
00595
if(!cname.
isEmpty())
00596 codec = QTextCodec::codecForName(cname);
00597
00598
if(codec)
00599 {
00600 d->codecForNameDict.replace(key, codec);
00601
return codec;
00602 }
00603
00604
00605
QString dir;
00606 {
00607
KConfigGroupSaver cfgsav( KGlobal::config(),
"i18n" );
00608 dir =
KGlobal::config()->
readPathEntry(
"i18ndir", QString::fromLatin1(
"/usr/share/i18n/charmaps"));
00609 dir +=
"/";
00610 }
00611
00612
00613
00614 cname = kcharsets_array_search< Aliases, const char* >( aliases, name.data());
00615
00616
if(cname.
isEmpty())
00617 cname = name;
00618 cname = cname.
upper();
00619
00620 codec = QTextCodec::loadCharmapFile((
QString)(dir + cname.data()));
00621
00622
if(codec) {
00623 d->codecForNameDict.replace(key, codec);
00624
return codec;
00625 }
00626
00627
00628
00629 cname = cname.
lower();
00630 cname = kcharsets_array_search< ConversionHints, const char* >( conversion_hints, (
const char*)cname );
00631
00632
if(!cname.
isEmpty())
00633 codec = QTextCodec::codecForName(cname);
00634
00635
if(codec) {
00636 d->codecForNameDict.replace(key, codec);
00637
return codec;
00638 }
00639
00640
00641 ok =
false;
00642
return QTextCodec::codecForName(
"iso8859-1");
00643 }
This file is part of the documentation for kdecore Library Version 3.4.0.