00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
#include "regexp.h"
00023
00024
#include "lexer.h"
00025
#include <stdio.h>
00026
#include <stdlib.h>
00027
#include <string.h>
00028
00029
using namespace KJS;
00030
00031 RegExp::RegExp(
const UString &p,
int f)
00032 : pat(p), flgs(f), m_notEmpty(false)
00033 {
00034
00035
00036
00037
UString intern;
00038
if (p.
find(
'\\') >= 0) {
00039
bool escape =
false;
00040
for (
int i = 0; i < p.
size(); ++i) {
00041
UChar c = p[i];
00042
if (escape) {
00043 escape =
false;
00044
00045
if (c ==
'u' && i + 4 < p.size()) {
00046
int c0 = p[i+1].
unicode();
00047
int c1 = p[i+2].unicode();
00048
int c2 = p[i+3].unicode();
00049
int c3 = p[i+4].unicode();
00050
if (Lexer::isHexDigit(c0) && Lexer::isHexDigit(c1) &&
00051 Lexer::isHexDigit(c2) && Lexer::isHexDigit(c3)) {
00052 c = Lexer::convertUnicode(c0, c1, c2, c3);
00053 intern +=
UString(&c, 1);
00054 i += 4;
00055
continue;
00056 }
00057 }
00058 intern +=
UString(
'\\');
00059 intern += UString(&c, 1);
00060 }
else {
00061
if (c ==
'\\')
00062 escape =
true;
00063
else
00064 intern +=
UString(&c, 1);
00065 }
00066 }
00067 }
else {
00068 intern = p;
00069 }
00070
00071
#ifdef HAVE_PCREPOSIX
00072
int pcreflags = 0;
00073
const char *perrormsg;
00074
int errorOffset;
00075
00076
if (flgs & IgnoreCase)
00077 pcreflags |= PCRE_CASELESS;
00078
00079
if (flgs & Multiline)
00080 pcreflags |= PCRE_MULTILINE;
00081
00082 pcregex = pcre_compile(intern.
ascii(), pcreflags,
00083 &perrormsg, &errorOffset, NULL);
00084
#ifndef NDEBUG
00085
if (!pcregex)
00086 fprintf(stderr,
"KJS: pcre_compile() failed with '%s'\n", perrormsg);
00087
#endif
00088
00089
#ifdef PCRE_INFO_CAPTURECOUNT
00090
00091
int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
00092
if (rc != 0)
00093
#endif
00094
nrSubPatterns = 0;
00095
00096
#else
00097
00098 nrSubPatterns = 0;
00099
int regflags = 0;
00100
#ifdef REG_EXTENDED
00101
regflags |= REG_EXTENDED;
00102
#endif
00103
#ifdef REG_ICASE
00104
if ( f & IgnoreCase )
00105 regflags |= REG_ICASE;
00106
#endif
00107
00108
00109
00110
00111
00112
00113
if (regcomp(&preg, intern.
ascii(), regflags) != 0) {
00114
00115 regcomp(&preg,
"", regflags);
00116 }
00117
#endif
00118
}
00119
00120 RegExp::~RegExp()
00121 {
00122
#ifdef HAVE_PCREPOSIX
00123
if (pcregex)
00124 pcre_free(pcregex);
00125
#else
00126
00127 regfree(&preg);
00128
#endif
00129
}
00130
00131
/**
 * Searches @p s for this pattern, starting at index @p i.
 *
 * @param s        subject string
 * @param i        start index; negative values are treated as 0
 * @param pos      optional; receives the start index of the match, or -1
 *                 when nothing matched
 * @param ovector  optional; receives a new[]-allocated array of
 *                 (nrSubPatterns+1)*3 ints in which entries 2*j and 2*j+1
 *                 are the start and end offset of group j (group 0 is the
 *                 whole match). This function never frees the array, so
 *                 the caller is presumably expected to delete[] it - confirm
 *                 against callers. In the PCRE build the array is allocated
 *                 even when no match is found; in the POSIX build *ovector
 *                 stays 0 on failure.
 * @return the matched substring, or UString::null on failure. In the PCRE
 *         build UString::null is also returned whenever @p ovector is 0,
 *         so the return value is only meaningful there when an ovector is
 *         requested.
 */
UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
{
  if (i < 0)
    i = 0;
  if (ovector)
    *ovector = 0L;
  int dummyPos;
  if (!pos)
    pos = &dummyPos;
  *pos = -1;
  if (i > s.size() || s.isNull())
    return UString::null;

#ifdef HAVE_PCREPOSIX
  CString buffer(s.cstring());
  int bufferSize = buffer.size();
  int ovecsize = (nrSubPatterns+1)*3;
  if (ovector) *ovector = new int[ovecsize];
  if (!pcregex)
    return UString::null;

  // pcre_exec() reports every failure as a negative value, not only
  // PCRE_ERROR_NOMATCH. Treat all of them as "no match": on any error the
  // ovector contents are not valid and must not be read below.
  if (pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i,
                m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED) : 0,
                ovector ? *ovector : 0L, ovecsize) < 0)
  {
    if ((flgs & Global) && m_notEmpty && ovector)
    {
      // The previous global match was empty, so this attempt was anchored
      // and had to be non-empty. It failed: drop the restriction and
      // search again from one character further on.
#ifndef NDEBUG
      fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
#endif
      m_notEmpty = false;
      // Never hand PCRE a start offset beyond the end of the subject.
      if (i + 1 > bufferSize ||
          pcre_exec(pcregex, NULL, buffer.c_str(), bufferSize, i+1, 0,
                    *ovector, ovecsize) < 0)
        return UString::null;
    }
    else
      return UString::null;
  }

  // Without an ovector there is nowhere to read the match position from.
  if (!ovector)
    return UString::null;
#else
  const uint maxMatch = 10;
  regmatch_t rmatch[maxMatch];

  // Work on a private copy of the 8-bit representation of the subject.
  char *str = strdup(s.ascii());
  if (!str)
    return UString::null;
  if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
    free(str);
    return UString::null;
  }
  free(str);

  // regexec() offsets are relative to str + i; add i to make them absolute.
  if (!ovector) {
    *pos = rmatch[0].rm_so + i;
    return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
  }

  // Take the group count from the compiled pattern instead of counting
  // until the first unmatched group: a group that did not participate
  // (rm_so == -1) must not hide the groups that follow it. At most
  // maxMatch-1 groups fit into rmatch next to the whole-match entry.
  const uint groups = preg.re_nsub < maxMatch ? (uint)preg.re_nsub
                                              : maxMatch - 1;
  nrSubPatterns = groups;
  int ovecsize = (groups+1)*3;
  *ovector = new int[ovecsize];
  for (uint j = 0; j <= groups; j++) {
    if (rmatch[j].rm_so < 0) {
      // Group did not participate in the match.
      (*ovector)[2*j] = -1;
      (*ovector)[2*j+1] = -1;
    } else {
      (*ovector)[2*j] = rmatch[j].rm_so + i;
      (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
    }
  }
#endif

  *pos = (*ovector)[0];
#ifdef HAVE_PCREPOSIX  // TODO check this stuff in non-pcre mode
  // An empty match in global mode: remember it so that the next call
  // demands a non-empty, anchored match (see the retry logic above).
  if ( *pos == (*ovector)[1] && (flgs & Global) )
  {
    m_notEmpty = true;
  }
#endif
  return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
}
00218
00219
#if 0 // unused
/**
 * Reports whether the pattern matches anywhere in @p s. The second
 * parameter is ignored. This function is currently compiled out.
 *
 * @return true on a match; false when @p s is null, when there is no
 *         compiled pattern, or when the regex backend reports a failure.
 */
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  // No subject or no compiled pattern: nothing can match.
  if (s.isNull() || !pcregex)
    return false;

  int ovector[300];
  CString buffer(s.cstring());

  // Every negative pcre_exec() result is a failure, not just
  // PCRE_ERROR_NOMATCH.
  return pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
                   0, ovector, 300) >= 0;

#else

  // Work on a private copy of the 8-bit representation of the subject.
  char *str = strdup(s.ascii());
  if (!str)
    return false;
  int r = regexec(&preg, str, 0, 0, 0);
  free(str);

  return r == 0;
#endif
}
#endif