SF Area/Download
Docs:
User
Internal/Devel
Support This Project
SourceForge.net Logo

csregex.cpp

Go to the documentation of this file.
00001 //---------------------------------------------------------------------------
00002 //
00003 // Author: Cleo Saulnier
00004 // Date: July 14, 2005
00005 //
00006 // Regular expression engine.
00007 //
00008 // See LICENSE file for legal information.
00009 // Basically public domain, but author keeps moral rights.
00010 // Original author's name above must remain in source.
00011 // Simply put, you can do what you want, but don't misrepresent
00012 // the origins, and contents, of this code.  
00013 //
00014 // http://sourceforge.net/projects/csregex/
00015 //
00016 //---------------------------------------------------------------------------
00023 // We don't use many string functions so
00024 // we don't risk that they don't have a
00025 // unicode capable compiler.  Simply
00026 // define the few functions we do use.
00027 
00028 #include <ctype.h>
00029 #include <string.h>
00030 
00031 #ifdef __BCPLUSPLUS__
00032 #pragma hdrstop
00033 #endif
00034 
00035 #include "csregex.h"
00036 //---------------------------------------------------------------------------
00037 
00038 #ifdef __BCPLUSPLUS__
00039 #pragma package(smart_init)
00040 #endif
00041 
00042 #ifndef _UNICODE
00043 #ifndef __TCHAR_DEFINED
00044 #define _tcslen strlen
00045 #define _tcscpy strcpy
00046 #define _totlower tolower
00047 #define _istdigit isdigit
00048 #define _T(a) a
00049 #endif
00050 #endif
00051 
00052 // This is used when advancing index over offsets.
00053 // In ASCII, you advance by 2 chars.
00054 // In UNICODE, you advance by 1 wchar_t.
00055 #ifdef _UNICODE
00056 #define CSREGEX_OFSWIDTH 1
00057 #else
00058 #define CSREGEX_OFSWIDTH 2 
00059 #endif
00060 
00065 static _TCHAR* CSRegExErrorStr[] = {_T("OK."),
00066   _T("Back references (round brackets) exceed maximum of 9."),
00067   _T("Missing end round bracket."),
00068   _T("Overlapping/redundant character specs."),
00069   _T("Escape character cannot appear at end of string by itself."),
00070   _T("Missing end square bracket."),
00071   _T("Invalid hex characters in escape sequence."),
00072   _T("Invalid repeat format."),
00073   _T("Invalid repeat range."),
00074   _T("Unbalanced round brackets."),
00075   _T("Invalid range."),
00076   _T("Invalid backreference."),
00077   _T("Regular expression is too long.")};
00078 
00079 CSRegEx::CSRegEx()
00080 {
00081   str = NULL;
00082   buf = NULL;
00083   str_len = 0;
00084   buf_len = 0;
00085   buf_size = 0;
00086   bMatchHead = false;
00087   error = -1;
00088   error_code = rge_ok;
00089   error_str = CSRegExErrorStr[0];
00090 }
00091 
00092 CSRegEx::~CSRegEx()
00093 {
00094   if (buf) delete[] buf;
00095 }
00096 
00102 _TUCHAR* CSRegEx::GetCompiledString() const
00103 {
00104   _TUCHAR *b = new _TUCHAR[_tcslen(reinterpret_cast<_TCHAR* const>(buf))+1];
00105   _tcscpy(reinterpret_cast<_TCHAR*>(b),reinterpret_cast<_TCHAR*>(buf));
00106   return b;
00107 }
00108 
00115 void CSRegEx::SetCompiledString(_TUCHAR *str)
00116 {
00117   int len = _tcslen(reinterpret_cast<_TCHAR*>(str))+1;
00118   if (len>buf_size)
00119   {
00120     if (buf_size==0) buf_size = 1024;
00121     else delete[] buf;
00122     while(len>buf_size) buf_size*=2;
00123     buf = new _TUCHAR[buf_size];
00124   }
00125   _tcscpy(reinterpret_cast<_TCHAR*>(buf),reinterpret_cast<_TCHAR*>(str));
00126 }
00127 
00133 void CSRegEx::Put(_TCHAR c)
00134 {
00135   if (buf_len+1==buf_size)
00136   {
00137     _TUCHAR *buf1 = new _TUCHAR[buf_size*2];
00138     memcpy(buf1,buf,buf_size*sizeof(_TUCHAR));
00139     delete[] buf;
00140     buf = buf1;
00141     buf_size*=2;
00142   }
00143   buf[buf_len++]=c;
00144 }
00145 
00146 // If we have unicode, then a wchar_t is large
00147 // enough to hold the offset.
00148 // Otherwise, we need to use 2 chars.
00149 #ifdef _UNICODE
00150 #define PutValue16 Put
00151 #define GetValue16(i) \
00152   static_cast<unsigned int>(buf[i])
00153 #else
00154 
00159 void CSRegEx::PutValue16(unsigned short c)
00160 {
00161   if (buf_len+2>=buf_size)
00162   {
00163     unsigned char *buf1 = new unsigned char[buf_size*2];
00164     memcpy(buf1,buf,buf_size);
00165     delete[] buf;
00166     buf = buf1;
00167     buf_size*=2;
00168   }
00169   // We add 0x0100 because we can't have any 0 values
00170   // in the high order.  
00171   *(reinterpret_cast<unsigned short*>(&buf[buf_len])) = c+0x0100;
00172   buf_len+=2;
00173 }
00174 
00175 #define GetValue16(i) \
00176   static_cast<unsigned int>((*(reinterpret_cast<unsigned short*>(&buf[i])))-0x0100)
00177   
00178 #endif
00179 
00180 #ifndef _UNICODE
00181 bool CSRegEx::Compile(const char *str)
00182 {
00183   return Compile((const unsigned char*)str);
00184 }
00185 #endif
00186 // Places an offset (v) in a previous location (a) in the
00187 // compiled string buffer.
00188 #ifndef _UNICODE
00189 #define PUT_OFFSET(a, v) \
00190         *(reinterpret_cast<unsigned short*>(&buf[(a)])) = (v)+0x0100
00191 #else
00192 #define PUT_OFFSET(a, v) \
00193         buf[(a)] = (v)
00194 #endif
00195 
00210 bool CSRegEx::Compile(const _TUCHAR *str)
00211 {
00212   error_code = rge_ok;
00213   if (buf==NULL)
00214   {
00215     buf_size = 1024;
00216     buf = new _TUCHAR[buf_size];
00217   }
00218   buf_len = 0;
00219   tal = false;
00220   if (str[0]==_T('^'))
00221   {
00222     Put(rg_hed);
00223     str++;
00224   }
00225   str_len = _tcslen(reinterpret_cast<const _TCHAR*>(str));
00226   this->str = str;
00227   opa_cnt=0;
00228   error = -1;
00229   Put(rg_opa);
00230   last_par.push_back(buf_len);
00231   PutValue16(0); // temporary forward offset.
00232   int pos = Compile(0);
00233   if ((last_par.size()!=1)||(pos!=str_len))
00234   {
00235     error = pos;
00236     error_code = rge_unbalanced_round_bracket;
00237     error_str = CSRegExErrorStr[error_code];
00238     last_par.clear();
00239     return false;
00240   }
00241   if (error!=-1)
00242   {
00243     error_str = CSRegExErrorStr[error_code];
00244     last_par.clear();
00245     return false;
00246   }
00247   int parpos = last_par.back();
00248   int offset = buf_len-parpos;
00249   PUT_OFFSET(parpos, offset);
00250   last_par.pop_back();
00251   Put(rg_cpa);
00252   if (tal) Put(rg_tal);
00253   Put(rg_end);
00254   error_str = CSRegExErrorStr[0];
00255   last_par.clear();
00256   return true;
00257 }
00258 
00265 void CSRegEx::CheckString(int str_idx)
00266 {
00267   if (GetValue16(str_idx)==1)
00268   {
00269     buf[str_idx-1] = rg_chr;
00270     buf_len = str_idx;
00271     Put(buf[str_idx+CSREGEX_OFSWIDTH]);
00272   }
00273 }
00274 
00281 int CSRegEx::Compile(int pos)
00282 {
00283   // This is used to tell if a string has been started.
00284   // So further chars can just be appended if true,
00285   // or a new string is created if false.
00286   bool str_started = false;
00287   int str_idx = 0; // Regular expression current char position.
00288   int st_pos;
00289   while(pos<str_len)
00290   {
00291     _TUCHAR c = str[pos];
00292     pos++;
00293     if (buf_len>=65279) // 0xffff - 0x0100
00294     {
00295       error = pos-1;
00296       error_code = rge_regex_too_long;
00297       return pos-1;
00298     }
00299     bool esc = false;
00300     switch(c)
00301     {
00302       case _T('.'):
00303         if (str_started)
00304         {
00305           CheckString(str_idx); // Compress string if need be.
00306           str_started = false;
00307         }
00308         st_pos = buf_len; // Save location of this item.
00309         Put(rg_dot);
00310         pos=CompileModifiers(pos, st_pos);
00311         if (error!=-1) return pos;
00312         break;
00313       case _T('('):
00314         {
00315         if (str_started)
00316         {
00317           CheckString(str_idx);
00318           str_started = false;
00319         }
00320         st_pos = buf_len;
00321         // match a new expression.
00322         int old_pos;
00323         int type = 0;
00324         old_pos = pos-1;
00325         int opa_ofs;
00326         if ((pos+1<str_len)&&(str[pos]==_T('?'))&&(str[pos+1]==_T(':')))
00327         {
00328           Put(rg_opn);
00329           last_par.push_back(buf_len);
00330           PutValue16(0); // placeholder for next section.
00331           pos+=2;
00332         }
00333         else
00334         {
00335           opa_cnt++;
00336           opa_ofs = opa_cnt;
00337           if (opa_cnt==10)
00338           {
00339             error = pos;
00340             error_code = rge_too_many_refs;
00341             return old_pos;
00342           }
00343           Put(rg_opa+opa_cnt);
00344           last_par.push_back(buf_len);
00345           PutValue16(0); // placeholder for next section.
00346           type = 1;
00347         }
00348         pos = Compile(pos); // Recursive.
00349         if (error!=-1) return pos;
00350         if ((pos>=str_len)||(str[pos]!=_T(')')))
00351         {
00352           // unbalanced parentheses.
00353           error = old_pos;
00354           error = rge_missing_round_bracket;
00355           return old_pos;
00356         }
00357         pos++;
00358         int parpos = last_par.back();
00359         int offset = buf_len-parpos;
00360         PUT_OFFSET(parpos, offset);
00361         last_par.pop_back();
00362         Put(type?rg_cpa+opa_ofs:rg_cpn);
00363         pos=CompileModifiers(pos, st_pos);
00364         if (error!=-1) return pos;
00365         break;
00366         }
00367       case _T('['):
00368         if (str_started)
00369         {
00370           CheckString(str_idx);
00371           str_started = false;
00372         }
00373         st_pos = buf_len;
00374         pos = CompileRange(pos);
00375         if (error!=-1) return pos;
00376         pos = CompileModifiers(pos,st_pos);
00377         if (error!=-1) return pos;
00378         break;
00379       case _T('|'):
00380         {
00381           if (str_started)
00382           {
00383             CheckString(str_idx);
00384             str_started = false;
00385           }
00386           int parpos = last_par.back();
00387           int offset = buf_len-parpos;
00388           PUT_OFFSET(parpos, offset);
00389           Put(rg_orm);
00390           last_par.back()=buf_len;  // Update last offset.
00391           PutValue16(0);
00392           break;
00393         }
00394       case _T(')'):
00395         if (str_started)
00396         {
00397           CheckString(str_idx);
00398           str_started = false;
00399         }
00400         return pos-1;
00401       case _T('\\'):
00402         // This section falls through to the default section.
00403         // Unless it's a backreference.
00404         if ((str[pos]>=_T('1'))&&(str[pos]<=_T('9')))
00405         {
00406           if (str_started)
00407           {
00408             CheckString(str_idx);
00409             str_started = false;
00410           }
00411           st_pos = buf_len;
00412           Put(rg_bck+(str[pos++]-_T('0')));
00413           pos = CompileModifiers(pos,st_pos);
00414           if (error!=-1) return pos;
00415           break;
00416         }
00417         else if (str[pos]==_T('0'))
00418         {
00419           error = pos-1;
00420           error_code = rge_invalid_backreference;
00421           return pos-1;
00422         }
00423         pos = Escape(pos, c);
00424         if (error!=-1) return pos;
00425         esc = true;
00426       default:
00427         {
00428         if ((!esc)&&(c==_T('$'))&&(pos==str_len))
00429         {
00430           tal=true;
00431           break;
00432         }
00433         // Must check if next char is a modifier.
00434         // If it is, we output a CHR instead of continuing a STR.
00435         _TUCHAR cc;
00436         if ((pos<str_len)&&(((cc=str[pos])==_T('?'))||(cc==_T('*'))||(cc==_T('+'))))
00437         {
00438           if (str_started)
00439           {
00440             CheckString(str_idx);
00441             str_started = false;
00442           }
00443           st_pos = buf_len;
00444           Put(rg_chr);
00445           Put(c);
00446           pos = CompileModifiers(pos,st_pos);
00447           if (error!=-1) return pos;
00448           break;
00449         }
00450         if (str_started)
00451         {
00452           unsigned int cnt = GetValue16(str_idx);
00453           // Overflow can't happen here, so we don't check here.
00454           // It's checked at the top of the loop.
00455           Put(c);
00456           cnt++;
00457           PUT_OFFSET(str_idx, cnt);
00458         }
00459         else
00460         {
00461           Put(rg_str);
00462           str_idx = buf_len;
00463           PutValue16(1);
00464           Put(c);
00465           str_started = true;
00466         }
00467         }
00468     }
00469   }
00470   if (str_started)
00471   {
00472     CheckString(str_idx);
00473     str_started = false;
00474   }
00475   return pos;
00476 }
00477 
00486 int CSRegEx::Escape(int pos, _TUCHAR &c)
00487 {
00488   _TUCHAR c1 = str[pos];
00489   switch(c1)
00490   {
00491     case _T('n'):
00492       c = (_TUCHAR)10;
00493       break;
00494     case _T('r'):
00495       c = (_TUCHAR)13;
00496       break;
00497     case _T('t'):
00498       c = (_TUCHAR)9;
00499       break;
00500     case _T('a'):
00501       c = (_TUCHAR)7;
00502       break;
00503     case _T('b'):
00504       c = (_TUCHAR)8;
00505       break;
00506     case _T('f'):
00507       c = (_TUCHAR)12;
00508       break;
00509     case _T('v'):
00510       c = (_TUCHAR)11;
00511       break;
00512     case _T('x'):
00513     case _T('X'):
00514       // exactly 2 or 4 (UNICODE) digits.
00515 #ifdef _UNICODE
00516 #define ESC_CHARS 4
00517 #else
00518 #define ESC_CHARS 2
00519 #endif
00520       if (pos+ESC_CHARS>=str_len)
00521       {
00522         error = pos-1;
00523         error_code = rge_esc_eof;
00524         return pos;
00525       }
00526       c = 0;
00527       unsigned char h[ESC_CHARS];
00528       h[0] = _totlower(str[pos+1]);
00529       h[1] = _totlower(str[pos+2]);
00530 #ifdef _UNICODE
00531       h[2] = _totlower(str[pos+2]);
00532       h[3] = _totlower(str[pos+3]);
00533 #endif
00534       for(int k=0;k<ESC_CHARS;k++)
00535       {
00536         c<<=4;
00537         if (_istdigit(h[k]))
00538         {
00539           c += h[k]-_T('0');
00540         }
00541         else if ((h[k]>='a')&&(h[k]<='f'))
00542         {
00543           c += h[k]-_T('a')+(_TUCHAR)10;
00544         }
00545         else
00546         {
00547           error = pos+k+1;
00548           error_code = rge_invalid_esc_hex;
00549           return pos;
00550         }
00551       }
00552       pos+=ESC_CHARS;
00553       break;
00554     default:
00555       c=c1;
00556   }
00557   pos++;
00558   return pos;
00559 }
00560 
00568 int CSRegEx::CompileModifiers(int pos, int st_pos)
00569 {
00570   if (pos>=str_len) return pos;
00571   int old_pos = pos;
00572   int buf_idx = buf_len;
00573   _TUCHAR c = str[pos];
00574   switch(c)
00575   {
00576     case _T('?'):
00577       Put(rg_qes);
00578       PutValue16(buf_len-st_pos);
00579       pos++;
00580       break;
00581     case _T('*'):
00582       Put(rg_sta);
00583       PutValue16(buf_len-st_pos);
00584       pos++;
00585       break;
00586     case _T('+'):
00587       Put(rg_pls);
00588       PutValue16(buf_len-st_pos);
00589       pos++;
00590       break;
00591     case _T('{'):
00592       // Get the range.
00593       // Possibilities:
00594       // {start,end}  // start-end
00595       // {start,}     // start-inf
00596       // {start}      // exactly start times.
00597       int start=0;
00598       int end = 0;
00599       bool inf = false;
00600       pos++;
00601       if (!_istdigit(str[pos]))
00602       {
00603         error = pos;
00604         error_code = rge_invalid_repeat_format;
00605         return pos;
00606       }
00607       do {start=start*10+static_cast<int>(str[pos++]-_T('0'));}
00608       while(_istdigit(str[pos]));
00609       if (str[pos]==_T('}'))
00610       {
00611         // exact {start}
00612         end = start;
00613         pos++;
00614       }
00615       else
00616       {
00617         if (str[pos]!=_T(','))
00618         {
00619           error = pos;
00620           error_code = rge_invalid_repeat_format;
00621           return pos;
00622         }
00623         pos++;
00624         if (str[pos]==_T('}'))
00625         {
00626           // {start,}
00627           inf = true;
00628           end = 0;
00629           pos++;
00630         }
00631         else
00632         {
00633           if (!_istdigit(str[pos]))
00634           {
00635             error = pos;
00636             error_code = rge_invalid_repeat_format;
00637             return pos;
00638           }
00639           do {end=end*10+static_cast<int>(str[pos++]-_T('0'));}
00640           while(_istdigit(str[pos]));
00641           if (str[pos]!=_T('}'))
00642           {
00643             error = pos;
00644             error_code = rge_invalid_repeat_format;
00645             return pos;
00646           }
00647           // {start,end}
00648           pos++;
00649         }
00650       }
00651       if ((start==0)&&(end==1))
00652       {
00653         // same as ?
00654         Put(rg_qes);
00655         PutValue16(buf_len-st_pos);
00656       }
00657       else if ((start==0)&&(inf))
00658       {
00659         // same as *
00660         Put(rg_sta);
00661         PutValue16(buf_len-st_pos);
00662       }
00663       else if ((start==1)&&(inf))
00664       {
00665         // same as +
00666         Put(rg_pls);
00667         PutValue16(buf_len-st_pos);
00668       }
00669       else if (((start==0)&&(end==0))||(start>end)||(start>65277)||(end>65277)||((!inf)&&(end==0)))
00670       {
00671         // Invalid ranges.
00672         error = old_pos;
00673         error_code = rge_invalid_repeat_range;
00674         return pos;
00675       }
00676       else
00677       {
00678         Put(rg_rpt);
00679         PutValue16(buf_len-st_pos);
00680         PutValue16(start+1);
00681         PutValue16((inf)?65279:end+1);
00682       }
00683   }
00684   // If the above doesn't match, neither will this, so it's ok.
00685   if (str[pos]==_T('?'))
00686   {
00687     // lazy.
00688     RGTok t = static_cast<RGTok>(buf[buf_idx]);
00689     RGTok tt = static_cast<RGTok>(0);
00690     if (t==rg_qes)
00691       tt=rg_qes_lazy;
00692     else if (t==rg_sta)
00693       tt=rg_sta_lazy;
00694     else if (t==rg_pls)
00695       tt=rg_pls_lazy;
00696     else if (t==rg_rpt)
00697       tt=rg_rpt_lazy;
00698     if (tt)
00699     {
00700       buf[buf_idx] = tt;
00701       pos++;
00702     }
00703   }
00704   return pos;
00705 }
00706 
00717 bool CSRegEx::RangeAddChar(vector<_TUCHAR> &chars, vector<_TUCHAR> &start, vector<_TUCHAR> &end, _TUCHAR c)
00718 {
00719 #ifdef _UNICODE
00720   unsigned int xc = static_cast<unsigned int>(c);
00721 #else
00722 #define xc c
00723 #define xi *i
00724 #define xj *j
00725 #endif
00726   // could use map to speed up searches, but there's never
00727   // enough chars to really make a difference.
00728   for(vector<_TUCHAR>::iterator i=chars.begin();i!=chars.end();i++)
00729   {
00730 #ifdef _UNICODE
00731     unsigned int xi = static_cast<unsigned int>(*i);
00732 #endif
00733     if (xc==xi)
00734     {
00735       error_code = rge_overlapping_chars;
00736       return false;
00737     }
00738   }
00739   bool added = false;
00740   vector<_TUCHAR>::iterator i = start.begin();
00741   vector<_TUCHAR>::iterator j = end.begin();
00742   for(;i!=start.end();i++,j++)
00743   {
00744 #ifdef _UNICODE
00745     unsigned int xi = static_cast<unsigned int>(*i);
00746     unsigned int xj = static_cast<unsigned int>(*j);
00747 #endif
00748     if ((xc>=xi)&&(xc<=xj))
00749     {
00750       error_code = rge_overlapping_chars;
00751       return false;
00752     }
00753     if ((!added)&&(xc==xi-1))
00754     {
00755       *i = c;
00756       added = true;
00757     }
00758     else if ((!added)&&(xc==xj+1))
00759     {
00760       *j = c;
00761       added = true;
00762     }
00763   }
00764   if (!added) chars.push_back(c);
00765   return true;
00766 #ifndef _UNICODE
00767 #undef xc
00768 #undef xi
00769 #undef xj
00770 #endif
00771 }
00772 
00784 bool CSRegEx::RangeAddRange(vector<_TUCHAR> &chars, vector<_TUCHAR> &start, vector<_TUCHAR> &end, _TUCHAR cstart, _TUCHAR cend)
00785 {
00786 #ifdef _UNICODE
00787   unsigned int xcstart = static_cast<unsigned int>(cstart);
00788   unsigned int xcend = static_cast<unsigned int>(cend);
00789 #else
00790 #define xcstart cstart
00791 #define xcend cend
00792 #define xi *i
00793 #define xj *j
00794 #endif
00795 
00796   for(vector<_TUCHAR>::iterator i=chars.begin();i!=chars.end();i++)
00797   {
00798 #ifdef _UNICODE
00799     unsigned int xi = static_cast<unsigned int>(*i);
00800 #endif
00801     if ((xi>=xcstart)&&(xi<=xcend))
00802     {
00803       error_code = rge_overlapping_chars;
00804       return false;
00805     }
00806   }
00807   bool added = false;
00808   vector<_TUCHAR>::iterator i = start.begin();
00809   vector<_TUCHAR>::iterator j = end.begin();
00810   for(;i!=start.end();i++,j++)
00811   {
00812 #ifdef _UNICODE
00813     unsigned int xi = static_cast<unsigned int>(*i);
00814     unsigned int xj = static_cast<unsigned int>(*j);
00815 #endif
00816     if (((xcstart>=xi)&&(xcstart<=xj))||((xcend>=xi)&&(xcend<=xj)))
00817     {
00818       error_code = rge_overlapping_chars;
00819       return false;
00820     }
00821     if ((!added)&&(xcend==xi-1))
00822     {
00823       *i = cstart;
00824       added = true;
00825     }
00826     else if ((!added)&&(xcstart==xj+1))
00827     {
00828       *j = cend;
00829       added = true;
00830     }
00831   }
00832   if (!added)
00833   {
00834     start.push_back(cstart);
00835     end.push_back(cend);
00836   }
00837   return true;
00838 #ifndef _UNICODE
00839 #undef xcstart
00840 #undef xcend
00841 #undef xi
00842 #undef xj
00843 #endif
00844 }
00845 
00855 void CSRegEx::CompressRange(vector<_TUCHAR> &chars, vector<_TUCHAR> &start, vector<_TUCHAR> &end)
00856 {
00857 #ifndef _UNICODE
00858 #define xc c
00859 #define xc1 c1
00860 #define xst st
00861 #define xen en
00862 #define xs1 s1
00863 #define xe1 e1
00864 #define xs2 s2
00865 #define xe2 e2
00866 #endif
00867   // create ranges with chars if possible.
00868   for(unsigned int i=0;i<chars.size();i++)
00869   {
00870     _TUCHAR c = chars[i];
00871 #ifdef _UNICODE
00872     unsigned int xc = static_cast<unsigned int>(c);
00873 #endif
00874     for (unsigned int j=i+1;j<chars.size();j++)
00875     {
00876       _TUCHAR c1 = chars[j];
00877 #ifdef _UNICODE
00878       unsigned int xc1 = static_cast<unsigned int>(c1);
00879 #endif
00880       if (xc+1==xc1)
00881       {
00882         start.push_back(c);
00883         end.push_back(c1);
00884         // j is always larger, so we have to erase that one first.
00885         chars.erase(chars.begin()+j);
00886         chars.erase(chars.begin()+i);
00887         i--;
00888         break;
00889       }
00890       else if (xc1+1==xc)
00891       {
00892         start.push_back(c1);
00893         end.push_back(c);
00894         chars.erase(chars.begin()+j);
00895         chars.erase(chars.begin()+i);
00896         i--;
00897         break;
00898       }
00899     }
00900   }
00901 
00902   // merge chars into ranges.
00903   for(unsigned int i=0;i<chars.size();i++)
00904   {
00905     _TUCHAR c = chars[i];
00906 #ifdef _UNICODE
00907     unsigned int xc = static_cast<unsigned int>(c);
00908 #endif
00909     for(unsigned int j=0;j<start.size();j++)
00910     {
00911       _TUCHAR st = start[j];
00912       _TUCHAR en = end[j];
00913 #ifdef _UNICODE
00914       unsigned int xst = static_cast<unsigned int>(st);
00915       unsigned int xen = static_cast<unsigned int>(en);
00916 #endif
00917       if (xc==xst-1)
00918       {
00919         start[j] = c;
00920         chars.erase(chars.begin()+i);
00921         i--;
00922       }
00923       else if (xc==xen+1)
00924       {
00925         end[j] = c;
00926         chars.erase(chars.begin()+i);
00927         i--;
00928       }
00929     }
00930   }
00931 
00932   // Merge ranges.
00933   for(unsigned int i=0;i<start.size();i++)
00934   {
00935     _TUCHAR s1 = start[i];
00936     _TUCHAR e1 = end[i];
00937 #ifdef _UNICODE
00938     unsigned int xs1 = static_cast<unsigned int>(s1);
00939     unsigned int xe1 = static_cast<unsigned int>(e1);
00940 #endif
00941     bool found = false;
00942     for(unsigned int j=i+1;j<end.size();j++)
00943     {
00944       _TUCHAR s2 = start[j];
00945       _TUCHAR e2 = end[j];
00946 #ifdef _UNICODE
00947       unsigned int xs2 = static_cast<unsigned int>(s2);
00948       unsigned int xe2 = static_cast<unsigned int>(e2);
00949 #endif
00950       if (xe2+1==xs1) // goes before
00951       {
00952         start[i] = s2;
00953         start.erase(start.begin()+j);
00954         end.erase(end.begin()+j);
00955         j--;
00956         found = true;
00957       }
00958       else if (xs2-1==xe1) // goes after
00959       {
00960         end[i] = e2;
00961         start.erase(start.begin()+j);
00962         end.erase(end.begin()+j);
00963         j--;
00964         found = true;
00965       }
00966     }
00967     if (found) i--; // Gotta scan this range again.
00968   }
00969 #ifndef _UNICODE
00970 #undef xc
00971 #undef xc1
00972 #undef xst
00973 #undef xen
00974 #undef xs1
00975 #undef xe1
00976 #undef xs2
00977 #undef xe2
00978 #endif
00979 }
00980 
00988 int CSRegEx::CompileRange(int pos)
00989 {
00990   vector<_TUCHAR> chars;
00991   vector<_TUCHAR> start;
00992   vector<_TUCHAR> end;
00993   // first char
00994   int old_pos = pos-1;
00995   int err_pos = pos;
00996   if (pos>=str_len)
00997   {
00998     error = old_pos;
00999     error_code = rge_missing_square_bracket;
01000     return old_pos;
01001   }
01002   _TUCHAR c = str[pos];
01003   bool neg = false;
01004   if (c==_T('^'))
01005   {
01006     neg = true;
01007     pos++;
01008     if (pos>=str_len)
01009     {
01010       error = pos-1;
01011       error_code = rge_missing_square_bracket;
01012       return old_pos;
01013     }
01014     err_pos = pos;
01015     c = str[pos];
01016   }
01017   pos++;
01018   if (c==_T('\\'))
01019   {
01020     pos = Escape(pos, c);
01021     if (error!=-1) return pos;
01022   }
01023   if (pos>=str_len)
01024   {
01025     error = old_pos;
01026     error_code = rge_missing_square_bracket;
01027     return old_pos;
01028   }
01029   for(;;)
01030   {
01031     bool range=false;
01032     if (pos+1<str_len)
01033     {
01034       int pos1 = pos;
01035       // check for range.
01036       _TUCHAR c1 = str[pos1++];
01037       _TUCHAR c2=static_cast<_TUCHAR>(0);
01038       if ((c1==_T('-'))&&(pos1<str_len))
01039       {
01040         c2 = str[pos1++];
01041         if (c2==_T('\\'))
01042         {
01043           pos1 = Escape(pos1, c2);
01044           if (error!=-1) return pos1;
01045         }
01046         if (c2!=_T(']'))
01047         {
01048 #ifdef _UNICODE
01049           if (static_cast<unsigned int>(c)>=static_cast<unsigned int>(c2))
01050 #else
01051           if (c>=c2)
01052 #endif
01053           {
01054             error = err_pos;
01055             error_code = rge_invalid_range;
01056             return pos1;
01057           }
01058           pos = pos1;
01059           bool b = RangeAddRange(chars,start,end,c,c2);
01060           if (!b)
01061           {
01062             error = err_pos;
01063             return pos;
01064           }
01065           range = true;
01066         }
01067       }
01068     }
01069     if (!range)
01070     {
01071       bool b = RangeAddChar(chars, start, end, c);
01072       if (!b)
01073       {
01074         error = err_pos;
01075         return pos;
01076       }
01077     }
01078     // Get next char.
01079     if (pos>=str_len)
01080     {
01081       error = old_pos;
01082       error_code = rge_missing_square_bracket;
01083       return pos;
01084     }
01085     err_pos = pos;
01086     c = str[pos++];
01087     if (c==_T(']')) break;
01088     if (c==_T('\\'))
01089     {
01090       pos = Escape(pos, c);
01091       if (error!=-1) return pos;
01092     }
01093   }
01094   CompressRange(chars,start,end);
01095   // Output the data.
01096   Put(neg?rg_grn:rg_grp);
01097   PutValue16(chars.size()+1); // We can't have 0 as a char, so add 1
01098   PutValue16(start.size()+1);
01099   for(unsigned int i=0;i<chars.size();i++)
01100   {
01101     Put(chars[i]);
01102   }
01103   for(unsigned int i=0;i<start.size();i++)
01104   {
01105     Put(start[i]);
01106     Put(end[i]);
01107   }
01108   return pos;
01109 }
01110 
01128 unsigned int CSRegEx::GetRpt(unsigned int buf_idx, int &low, int &high, bool &lazy, unsigned int &prev) const
01129 {
01130   RGTok t = static_cast<RGTok>(buf[buf_idx]);
01131   lazy = false;
01132   switch(t)
01133   {
01134     case rg_qes_lazy:
01135       lazy = true;
01136     case rg_qes:
01137       low = 0;
01138       high = 1;
01139       prev = buf_idx+1-GetValue16(buf_idx+1);
01140       return buf_idx+1+CSREGEX_OFSWIDTH;
01141     case rg_sta_lazy:
01142       lazy = true;
01143     case rg_sta:
01144       low = 0;
01145       high = 0x7fffffff;
01146       prev = buf_idx+1-GetValue16(buf_idx+1);
01147       return buf_idx+1+CSREGEX_OFSWIDTH;
01148     case rg_pls_lazy:
01149       lazy = true;
01150     case rg_pls:
01151       low = 1;
01152       high = 0x7fffffff;
01153       prev = buf_idx+1-GetValue16(buf_idx+1);
01154       return buf_idx+1+CSREGEX_OFSWIDTH;
01155     case rg_rpt_lazy:
01156       lazy = true;
01157     case rg_rpt:
01158       low = GetValue16(buf_idx+1+CSREGEX_OFSWIDTH)-1;
01159       high = GetValue16(buf_idx+1+2*CSREGEX_OFSWIDTH)-1;
01160       if (high==65534) high = 0x7fffffff;
01161       prev = buf_idx+1-GetValue16(buf_idx+1);
01162       return buf_idx+1+3*CSREGEX_OFSWIDTH;
01163     default:
01164       low=1;
01165       high=1;
01166       lazy=false;
01167       return buf_idx;
01168   }
01169 }
01170 
01188 bool CSRegEx::MatchRE(const _TUCHAR *str, const _TUCHAR *re)
01189 {
01190   if (!Compile(re)) return false;
01191   return Match(str);
01192 }
01193 
01194 #ifndef _UNICODE
01195 bool CSRegEx::MatchRE(const char *str, const char *re)
01196 {
01197   return MatchRE((const unsigned char*)str,(const unsigned char*)re);
01198 }
01199 
01200 bool CSRegEx::MatchRE(const unsigned char *str, const char *re)
01201 {
01202   return MatchRE(str,(const unsigned char*)re);
01203 }
01204 
01205 bool CSRegEx::MatchRE(const char *str, const unsigned char *re)
01206 {
01207   return MatchRE((const unsigned char*)str,re);
01208 }
01209 
01210 bool CSRegEx::Match(const unsigned char *str, const char *cmp)
01211 {
01212   return Match(str,(const unsigned char*)cmp);
01213 }
01214 
01215 bool CSRegEx::Match(const char *str, const unsigned char *cmp)
01216 {
01217   return Match((const unsigned char*)str,cmp);
01218 }
01219 
01220 bool CSRegEx::Match(const char *str, const char *cmp)
01221 {
01222   return Match((const unsigned char*)str,(const unsigned char*)cmp);
01223 }
01224 #endif
01225 
01238 bool CSRegEx::Match(const _TUCHAR *str, const _TUCHAR *cmp)
01239 {
01240   _TUCHAR *buf1 = buf;
01241   // We're removing the const here, but we don't actually
01242   // modify the string.  It's just so we don't have to
01243   // allocate another non-const variable.
01244   // The compile function does modify the 'buf' member,
01245   // but we don't call it and we remove 'cmp' before
01246   // we return, so no problem.
01247   buf = const_cast<_TUCHAR*>(cmp);
01248   bool b = Match(str);
01249   buf = buf1;
01250   return b;
01251 }
01252 
01253 #ifndef _UNICODE
01254 bool CSRegEx::Match(const char *str)
01255 {
01256   return Match((const unsigned char*)str);
01257 }
01258 #endif
01259 
01276 bool CSRegEx::Match(const _TUCHAR *str)
01277 {
01278   int max=1;
01279   if ((buf[0]!=rg_hed)&&(!bMatchHead)) max = _tcslen(reinterpret_cast<const _TCHAR*>(str));
01280   // TODO: Make a list of FIRST chars and scan for that instead.
01281   for(int i=0;i<max;i++)
01282   {
01283     if (Match1(str+i))
01284     {
01285       MatchStart += i;
01286       MatchEnd += i;
01287       // Cleanup
01288       for(int j=0;j<10;j++)
01289       {
01290         if (match_start[j].size()!=0)
01291         {
01292           BackStart[j] = match_start[j].back()+i;
01293           BackEnd[j] = match_end[j].back()+i;
01294         }
01295         else
01296         {
01297           BackStart[j] = 0;
01298           BackEnd[j] = 0;
01299         }
01300         match_start[j].clear();
01301         match_end[j].clear();
01302       }
01303       par.clear();
01304       return true;
01305     }
01306   }
01307   // Cleanup
01308   for(int i=0;i<10;i++)
01309   {
01310     match_start[i].clear();
01311     match_end[i].clear();
01312   }
01313   par.clear();
01314   return false;
01315 }
01316 
01317 // This checks the results of basic matching.
01318 
01319 #define CHECK_MATCH() \
01320   if (cnt>=low) /* We got the minimum required.*/ \
01321   { \
01322     /* success. */ \
01323     /* if this is lazy, then we have to keep a ref */ \
01324     /* to the empty match.*/ \
01325     if ((lazy)&&(cnt==0)) \
01326     { \
01327       m->rpt = -1; \
01328       idx++; \
01329       if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];} \
01330       m1 = &matches[idx]; \
01331       m1->start = m->end;  \
01332       m = m1; \
01333     } \
01334     m->buf_idx = next; \
01335     m->rpt = 0; \
01336   } \
01337   else \
01338   { \
01339     /* failure. */ \
01340     if (cnt==-1) cnt = min; \
01341     idx-=(cnt+1); \
01342     state=1; /* backtrack error. */ \
01343   } 
01344 
01345 
01356 bool CSRegEx::Match1(const _TUCHAR *str)
01357 {
01358   unsigned int par_index=-1;
01359   int idx=0;
01360   int matches_size = 1024;
01361   bool backtrack=false; // Used for lazy backtracking.  
01362 
01363   // Setup
01364   this->str = str;
01365 
01366   matches.resize(matches_size);
01367   matches[0].buf_idx=0;
01368   matches[0].start = 0;
01369   matches[0].rpt = 0;
01370 
01371   if (buf[0]==rg_hed) matches[0].buf_idx++;
01372 
01373   int state = 0;  // 0=forward, 1=backtrack.
01374 
01375   CSMatch *m1;
01376 
01377   while(idx!=-1)
01378   {
01379     CSMatch *m = &matches[idx];
01380     RGTok t = static_cast<RGTok>(buf[m->buf_idx]);
01381     RGTok tt;
01382 
01383     unsigned int rpt_idx;
01384     int low, high; // must be signed.  repeat ranges.
01385     bool lazy;
01386     unsigned int next, prev; // Next and previous item positions.
01387 
01388     if (state==0)
01389     {
01390       m->end = m->start;
01391 
01392       switch(t)
01393       {
01394         case rg_tal:
01395           // If we find end of line, don't advance m->end counter.
01396           if (str[m->start]!=_T('\0'))
01397           {
01398             state = 1;
01399             idx--;
01400             break;
01401           }
01402           idx++;
01403           if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01404           m1 = &matches[idx];
01405           m1->buf_idx = m->buf_idx+1;
01406           m1->start = m->end;
01407           m1->rpt = 0;
01408           break;
01409         case rg_chr:
01410           {
01411             // Get the repeat char, if any.
01412             rpt_idx = m->buf_idx+2;
01413             next = GetRpt(rpt_idx,low,high,lazy, prev);
01414 
01415             int cnt=(backtrack)?-1:0;
01416             unsigned int min=(backtrack)?m->rpt:0;
01417             unsigned int max=(lazy)?((backtrack)?m->rpt+1:low):high;
01418             for(unsigned int i=min;i<max;i++)
01419             {
01420               // Match 1 character.
01421               if (str[m->start]!=buf[m->buf_idx+1]) break;
01422               cnt=i+1;
01423               m->end++;
01424               idx++;
01425               if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01426               m1 = &matches[idx];
01427               m1->buf_idx = m->buf_idx;
01428               m1->start = m->end;
01429               m1->end = m1->start;
01430               m1->rpt = m->rpt+1;
01431               m = m1;
01432             }
01433             CHECK_MATCH();
01434             break;
01435           }
01436         case rg_str:
01437           {
01438             // rg_str can't be repeated, so this is simple.
01439             unsigned int cnt = GetValue16(m->buf_idx+1);
01440             for(unsigned int i=0;i<cnt;i++)
01441             {
01442               if (str[m->end++]!=buf[m->buf_idx+1+CSREGEX_OFSWIDTH+i])
01443               {
01444                 state = 1; // backtrack error.
01445                 idx--;
01446                 break;
01447               }
01448             }
01449             if (state==1) break;
01450             idx++;
01451             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01452             m1 = &matches[idx];
01453             m1->buf_idx = m->buf_idx+1+CSREGEX_OFSWIDTH+cnt;
01454             m1->start = m->end;
01455             m1->rpt = 0;
01456             break;
01457           }
01458         case rg_dot:
01459           {
01460             rpt_idx = m->buf_idx+1;
01461             next = GetRpt(rpt_idx,low,high,lazy,prev);
01462 
01463             int cnt=(backtrack)?-1:0;
01464             unsigned int min=(backtrack)?m->rpt:0;
01465             unsigned int max=(lazy)?((backtrack)?m->rpt+1:low):high;
01466             for(unsigned int i=min;i<max;i++)
01467             {
01468               if (str[m->start]==_T('\0')) break;
01469               cnt=i+1;
01470               m->end++;
01471               idx++;
01472               if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01473               m1 = &matches[idx];
01474               m1->buf_idx = m->buf_idx;
01475               m1->start = m->end;
01476               m1->end = m1->start;
01477               m1->rpt = m->rpt+1;
01478               m = m1;
01479             }
01480             CHECK_MATCH();
01481             break;
01482           }
01483         case rg_bck:
01484         case rg_bck1: case rg_bck2: case rg_bck3:
01485         case rg_bck4: case rg_bck5: case rg_bck6:
01486         case rg_bck7: case rg_bck8: case rg_bck9:
01487           {
01488             // match backreference.
01489             rpt_idx = m->buf_idx+1;
01490             next = GetRpt(rpt_idx,low,high,lazy,prev);
01491 
01492             int cnt=(backtrack)?-1:0;
01493             unsigned int min=(backtrack)?m->rpt:0;
01494             unsigned int max=(lazy)?((backtrack)?m->rpt+1:low):high;
01495             unsigned int back_idx = t-rg_bck;
01496             unsigned int start = match_start[back_idx].back();
01497             unsigned int end = match_end[back_idx].back();
01498             if (start==end) max = low; // For empty backreferences.  Has to match minimum, so we'll have redundant entries.
01499             for(unsigned int i=min;i<max;i++)
01500             {
01501               bool bok = true;
01502               for(unsigned int j=start;j<end;j++)
01503               {
01504                 if (str[j]!=str[m->end++])
01505                 {
01506                   bok = false;
01507                   break;
01508                 }
01509               }
01510               if (!bok) break;
01511               cnt=i+1;
01512               idx++;
01513               if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01514               m1 = &matches[idx];
01515               m1->buf_idx = m->buf_idx;
01516               m1->start = m->end;
01517               m1->end = m1->start;
01518               m1->rpt = m->rpt+1;
01519               m = m1;
01520             }
01521             CHECK_MATCH();
01522             break;
01523           }
01524         case rg_grp:
01525         case rg_grn:
01526           {
01527             // Check group of chars and ranges.
01528             unsigned int char_cnt = GetValue16(m->buf_idx+1)-1;
01529             unsigned int rng_cnt = GetValue16(m->buf_idx+1+CSREGEX_OFSWIDTH)-1;
01530             _TUCHAR c = str[m->start];
01531 
01532             rpt_idx = m->buf_idx+1+2*CSREGEX_OFSWIDTH+char_cnt+2*rng_cnt;
01533             next = GetRpt(rpt_idx,low,high,lazy,prev);
01534 
01535             int cnt=(backtrack)?-1:0;
01536             unsigned int min=(backtrack)?m->rpt:0;
01537             unsigned int max=(lazy)?((backtrack)?m->rpt+1:low):high;
01538 
01539             if (c!=_T('\0'))
01540             {
01541               for(unsigned int i=min;i<max;i++)
01542               {
01543                 c = str[m->start];
01544                 if (c==_T('\0')) break;
01545                 bool found = false;
01546                 unsigned int pos = m->buf_idx+1+2*CSREGEX_OFSWIDTH;
01547                 for(unsigned int j=0;j<char_cnt;j++)
01548                 {
01549                   if (c==buf[pos++])
01550                   {
01551                     found = true;
01552                     break;
01553                   }
01554                 }
01555                 if (!found)
01556                 {
01557 #ifdef _UNICODE
01558                   unsigned int xc = static_cast<unsigned int>(c);
01559 #else
01560 #define xc c
01561 #define xlow1 low1
01562 #define xhigh1 high1
01563 #endif
01564                   for(unsigned int j=0;j<rng_cnt;j++)
01565                   {
01566                     _TUCHAR low1 = buf[pos++];
01567                     _TUCHAR high1 = buf[pos++];
01568 #ifdef _UNICODE
01569                     unsigned int xlow1 = static_cast<unsigned int>(low1);
01570                     unsigned int xhigh1 = static_cast<unsigned int>(high1);
01571 #endif
01572                     if ((xc>=xlow1)&&(xc<=xhigh1))
01573                     {
01574                       found = true;
01575                       break;
01576                     }
01577                   }
01578                 }
01579 #ifndef _UNICODE
01580 #undef xc
01581 #undef xlow1
01582 #undef xhigh1
01583 #endif
01584                 //
01585                 bool b = (t==rg_grn);
01586                 if (found==b) break;
01587                 cnt=i+1;
01588                 idx++;
01589                 if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01590                 m1 = &matches[idx];
01591                 m1->buf_idx = m->buf_idx;
01592                 m->end++;
01593                 m1->start = m->end;
01594                 m1->end = m1->start;
01595                 m1->rpt = m->rpt+1;
01596                 m = m1;
01597               }
01598             }
01599             CHECK_MATCH();
01600             break;
01601           }
01602         case rg_opa:
01603         case rg_opa1: case rg_opa2: case rg_opa3:
01604         case rg_opa4: case rg_opa5: case rg_opa6:
01605         case rg_opa7: case rg_opa8: case rg_opa9:
01606         case rg_opn:
01607           {
01608             par_index++; // Get ourselves a new par index.
01609             if (par_index+1>par.size()) par.resize(par_index+1);
01610             par[par_index].match_idx = idx;
01611             par[par_index].buf_idx = m->buf_idx;
01612             // start and end will be updated later.
01613             // so just setup the next items.
01614             idx++;
01615             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01616             m1 = &matches[idx];
01617             m1->buf_idx = m->buf_idx+1+CSREGEX_OFSWIDTH;
01618             m1->start = m->end;
01619             m1->rpt = 0;
01620             // Gotta check for greediness.
01621             // If this is the first item and it's lazy, then skip to the end.
01622             if (m->rpt==0)
01623             {
01624               unsigned int pos = m->buf_idx+1+GetValue16(m->buf_idx+1);
01625               tt = static_cast<RGTok>(buf[pos]);
01626               while(tt==rg_orm)
01627               {
01628                 pos += GetValue16(pos+1)+1;
01629                 tt = static_cast<RGTok>(buf[pos]);
01630               }
01631               pos++;
01632               int n = GetRpt(pos,low,high,lazy,prev);
01633               if ((lazy)&&(low==0))
01634               {
01635                 // empty entry is ok.
01636                 m->end = m->start;
01637                 m->rpt = -1;
01638                 idx++;
01639                 if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01640                 m1 = &matches[idx];
01641                 m1->buf_idx = n;
01642                 m1->start = m->buf_idx;
01643                 m1->end = par[par_index].match_idx;
01644                 tt = (RGTok)buf[m->buf_idx];
01645                 if ((tt>=rg_opa)&&(tt<=rg_opa9))
01646                 {
01647                   unsigned int k = tt-rg_opa;
01648                   match_start[k].push_back(m->start);
01649                   match_end[k].push_back(m->end);
01650                 }
01651                 par_index--;
01652                 m1->rpt = -1;
01653                 idx++;
01654                 if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-2];}
01655                 CSMatch *m2 = &matches[idx];
01656                 m2->buf_idx = n;
01657                 m2->start = m->start;
01658                 m2->rpt = 0;
01659               }
01660             }
01661             break;
01662           }
01663         case rg_cpa:
01664         case rg_cpa1: case rg_cpa2: case rg_cpa3:
01665         case rg_cpa4: case rg_cpa5: case rg_cpa6:
01666         case rg_cpa7: case rg_cpa8: case rg_cpa9:
01667         case rg_cpn:
01668           {
01669             int k = par[par_index].match_idx;
01670             matches[k].end = m->start;
01671             m->rpt = matches[k].rpt;
01672             m->start = par[par_index].buf_idx;
01673             m->end = k;
01674             if (t!=rg_cpn)
01675             {
01676               int k1 = t-rg_cpa;
01677               match_start[k1].push_back(matches[k].start);
01678               match_end[k1].push_back(matches[k].end);
01679             }
01680             idx++;
01681             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01682             m1 = &matches[idx];
01683             m1->buf_idx = m->buf_idx+1;
01684             m1->start = matches[k].end;
01685             m1->rpt = 0;
01686             par_index--;
01687             break;
01688           }
01689         case rg_orm:
01690           // success on matching par, skip ahead.
01691           {
01692             unsigned int ofs = GetValue16(m->buf_idx+1);
01693             unsigned int pos = m->buf_idx+1+ofs;
01694             tt = static_cast<RGTok>(buf[pos]);
01695             while(tt==rg_orm)
01696             {
01697               ofs = GetValue16(pos+1);
01698               pos += ofs+1;
01699               tt = static_cast<RGTok>(buf[pos]);
01700             }
01701             m->buf_idx=pos;
01702             break;
01703           }
01704         case rg_pls:
01705         case rg_sta:
01706         case rg_qes:
01707         case rg_rpt:
01708         case rg_pls_lazy:
01709         case rg_sta_lazy:
01710         case rg_qes_lazy:
01711         case rg_rpt_lazy:
01712           {
01713             // This will only get here for round brackets.
01714             next = GetRpt(m->buf_idx,low,high,lazy,prev);
01715             CSMatch *m2 = &matches[idx-1];
01716             m1 = &matches[m2->end];
01717             int rpt = m2->rpt+1;
01718             if ((rpt==high)||((lazy)&&(rpt>=low)))
01719             {
01720               // continue to next item.
01721               m->buf_idx = next;
01722               break;
01723             }
01724             else if (rpt==0)
01725             {
01726               // matches 0 times, but that's not in the range.
01727               // ex: matches 0 times, but we have a '+'
01728               idx--;
01729               state = 1;
01730               break;
01731             }
01732             // Try this same one again.
01733             m->buf_idx = prev;
01734             m->rpt = m1->rpt+1;
01735             break;
01736           }
01737         case rg_end:
01738           // Success
01739           MatchStart = match_start[0].back();
01740           MatchEnd = match_end[0].back();
01741           return true;
01742       }
01743       backtrack = false;
01744     }
01745     else
01746     {
01747       // backtrack.
01748       // Check if repeat is lazy or not.
01749       // lazy
01750       // if it's a ), then remove match_start and match_end if necessary. par_index++
01751       // if it's a (, then check if we can advance to next | item.  If it's ), continue backtracking as if normal item.
01752       // if rpt < low, keep backtracking.
01753       // if rpt is zero and {empty} is allowed, convert this item or par set to rpt = -1.
01754       // otherwise create next item and we're done.
01755       unsigned int next = Next(m->buf_idx);
01756       unsigned int n = GetRpt(next,low,high,lazy,prev);
01757       tt = static_cast<RGTok>(buf[m->buf_idx]);
01758       // )
01759       if ((tt==rg_cpn)||((tt>=rg_cpa)&&(tt<=rg_cpa9)))
01760       {
01761         if ((lazy)&&(m->rpt+1<high))
01762         {
01763           if (m->rpt==-1)
01764           {
01765             // Gotta expand from empty match to single match.
01766             idx--;
01767             m = &matches[idx];
01768             m->rpt = 0; // This is all that's needed to do that.
01769           }
01770           else
01771           {
01772             // Lazy expands, so repeat another set of round brackets.
01773             idx++;
01774             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01775             m1 = &matches[idx];
01776             CSMatch *m2 = &matches[m->end];
01777             m1->buf_idx = prev;
01778             m1->start = m2->end;
01779             m1->rpt = m2->rpt+1;
01780           }
01781           state = 0;
01782         }
01783         else
01784         {
01785           // Not lazy.  So backtrack normal.
01786           // Gotta update par indexes.
01787           if (m->rpt==-1)
01788           {
01789             // empty set.
01790             // remove last 2 items.
01791             idx-=2;
01792           }
01793           else
01794           {
01795             par_index++;
01796             if (par_index+1>par.size()) par.resize(par_index);
01797             par[par_index].buf_idx = m->start;
01798             par[par_index].match_idx = m->end;
01799             idx--;
01800           }
01801           if (tt!=rg_cpn)
01802           {
01803             int k = tt-rg_cpa;
01804             match_start[k].pop_back();
01805             match_end[k].pop_back();
01806           }
01807         }
01808       }
01809       else if ((tt==rg_orm)||(tt==rg_opn)||((tt>=rg_opa)&&(tt<=rg_opa9)))
01810       {
01811         // Gotta try next | section.
01812         m->buf_idx = m->buf_idx+1+GetValue16(m->buf_idx+1);
01813         tt = static_cast<RGTok>(buf[m->buf_idx]);
01814         if (tt==rg_orm)
01815         {
01816           // Try next | section
01817           idx++;
01818           if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01819           m1 = &matches[idx];
01820           m1->buf_idx = m->buf_idx+1+CSREGEX_OFSWIDTH;
01821           m1->start = m->start;
01822           m1->rpt = 0;
01823           state = 0;
01824         }
01825         else
01826         {
01827           // found close par.  So backtrack/remove this () set.
01828           // Gotta get the real high low.
01829           n = GetRpt(Next(m->buf_idx),low,high,lazy,prev);
01830           if (lazy)
01831           {
01832             // This is a bit hard to explain.
01833             // If this is the 1st repetition in a lazy expansion,
01834             // it will have m->rpt = 0 and we can just remove
01835             // it with idx--, and the lazy expansion will fail (regex-wise which is ok).
01836             // Other backtracking will resume normally.
01837             // But if it isn't the 1st item, then we need
01838             // to break through the closing ) of the previous
01839             // bracket set so that backtracking will continue.
01840             // This is because this is lazy backtracking (haha).
01841             // Anyways, if we just remove the current opening (,
01842             // when we loop around again and try to backtrack on a ),
01843             // since this is a lazy backtracking, it will try to go forward
01844             // because lazy backtracking is actually *expansion* and will
01845             // match another set of ().  THIS VERY SET WHICH JUST FAILED!!!
01846             // We'd end up in an endless loop.
01847             // So we break through the previous () set's closing ) to
01848             // continue the backtracking.
01849             par_index--;
01850             idx--;
01851             if (m->rpt!=0)
01852             {
01853               m = &matches[idx];
01854               par_index++;
01855               if (par_index+1>par.size()) par.resize(par_index);
01856               par[par_index].buf_idx = m->start;
01857               par[par_index].match_idx = m->end;
01858               if (tt!=rg_cpn)
01859               {
01860                 int k = tt-rg_cpa;
01861                 match_start[k].pop_back();
01862                 match_end[k].pop_back();
01863               }
01864               idx--;
01865             }
01866           }
01867           else if (m->rpt<low)
01868           {
01869             // Normal backtracking.
01870             par_index--;
01871             idx--;
01872           }
01873           else if ((m->rpt==0)&&(low==0))
01874           {
01875             // When shrinking greedy repetitions,
01876             // and we're on the first repetition,
01877             // and the empty set is valid, then we must try
01878             // the empty set.
01879             // Convert to rpt -1
01880             // m->rpt=-1 means empty set.
01881             m->end = m->start;
01882             m->rpt = -1;
01883             // Insert closing ) for empty set.
01884             idx++;
01885             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01886             m1 = &matches[idx];
01887             m1->buf_idx = m->buf_idx;
01888             m1->start = par[par_index].buf_idx;
01889             m1->end = par[par_index].match_idx;
01890             tt = static_cast<RGTok>(buf[par[par_index].buf_idx]);
01891             if ((tt>=rg_opa)&&(tt<=rg_opa9))
01892             {
01893               unsigned int k = tt-rg_opa;
01894               match_start[k].push_back(m->start);
01895               match_end[k].push_back(m->end);
01896             }
01897             par_index--;
01898             m1->rpt = -1;
01899             // Insert item for next match.
01900             // Going forward now. Backtracking done.
01901             idx++;
01902             if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-2];}
01903             CSMatch *m2 = &matches[idx];
01904             m2->buf_idx = n;
01905             m2->start = m->start;
01906             m2->rpt = 0;
01907             state = 0;
01908           }
01909           else
01910           {
01911             // Done backtracking.
01912             // Go forward with next item.
01913             par_index--;
01914             m1 = &matches[idx-1]; // Previous item is )
01915             m->buf_idx = GetRpt(Next(m1->buf_idx),low,high,lazy,prev);
01916             m1 = &matches[m1->end]; // Get previous (
01917             m->start = m1->end;
01918             m->rpt = 0;
01919             state = 0;
01920           }
01921         }
01922       }
01923       else if ((lazy)&&(m->rpt+1==high))
01924       {
01925         // backtrack all the way.
01926         idx-=(m->rpt+1);
01927       }
01928       else if ((lazy)&&(m->rpt+1>=low))
01929       {
01930         if (m->rpt==-1)
01931         {
01932           // Expansion of the empty item.
01933           // -1 is empty while 0 is first item.
01934           m->rpt = 0;
01935         }
01936         else
01937         {
01938           // Normal repetition expansion.
01939           idx++;
01940           if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01941           m1 = &matches[idx];
01942           m1->buf_idx = m->buf_idx;
01943           m1->start = m->end;
01944           m1->rpt = m->rpt+1; // Increase repetition number.
01945         }
01946         backtrack = true;
01947         state = 0;
01948       }
01949       else if (m->rpt<low)
01950       {
01951         // This is normal backtracking.
01952         // Everything else is a special case.
01953         idx--;
01954       }
01955       else if ((m->rpt==0)&&(low==0))
01956       {
01957         // convert current repeat to 0 matches.
01958         // When we shrink past the 1st item and the empty item
01959         // is valid, we must try it.
01960         // We do that by simply setting m->rpt = -1
01961         // m->rpt = -1 mean empty item or set.
01962         m->end = m->start;
01963         m->rpt = -1;
01964         // Move forward to next item. Done backtracking.
01965         idx++;
01966         if (idx+1>=matches_size) {matches.resize(matches_size*=2); m=&matches[idx-1];}
01967         m1 = &matches[idx];
01968         m1->buf_idx = n;
01969         m1->start = m->end;
01970         m1->rpt = 0;
01971         state = 0;
01972       }
01973       else
01974       {
01975         // go forward.
01976         // This item ok.
01977         m1 = &matches[idx-1];
01978         m->buf_idx = GetRpt(Next(m1->buf_idx),low,high,lazy,prev);
01979         m->start = m1->end;
01980         m->rpt = 0;
01981         state = 0;
01982       }
01983     }
01984   }
01985   return false;
01986 }
01987 
01994 unsigned int CSRegEx::Next(unsigned int pos) const
01995 {
01996   RGTok t = static_cast<RGTok>(buf[pos]);
01997 
01998   unsigned int cpy = t;
01999   cpy>>=5;
02000 #ifdef _UNICODE
02001   // This takes some explaining.
02002   // For ASCII, the item byte is always 1.
02003   // Anything after this initial byte will be 2 bytes for offsets.
02004   // So if an item takes up 1 byte, then it's also 1 wchar_t in UNICODE
02005   // So if an item takes up 3 bytes, then it's 1 byte + 1 offset = 2 in UNCODE
02006   // So if an item takes up 5 bytes, then it's 1 byte + 2 offset = 3 in UNCODE
02007   // So if an item takes up 7 bytes, then it's 1 byte + 3 offset = 4 in UNCODE
02008   // Note that (1+1)/2 = 1
02009   // Note that (3+1)/2 = 2
02010   // Note that (5+1)/2 = 3
02011   // Note that (6+1)/2 = 4
02012   // So, we have to add 1 and divide by 2 to get the number
02013   // of wchar_t's in UNICODE.
02014   // And that's what we do here.
02015   cpy = (cpy+1+(t==rg_chr))>>1;
02016 #endif
02017   if (cpy==0)
02018   {
02019     if (t==rg_str)
02020     {
02021       cpy = GetValue16(pos+1)+1+CSREGEX_OFSWIDTH;
02022     }
02023     else if ((t==rg_grp)||(t==rg_grn))
02024     {
02025       cpy = GetValue16(pos+1)-1+2*(GetValue16(pos+1+CSREGEX_OFSWIDTH)-1)+1+2*CSREGEX_OFSWIDTH;
02026     }
02027   }
02028   return pos+cpy;
02029 }

Docs for CSRegEx created on Tue Dec 11 14:36:53 2007 by Doxygen 1.4.3


Webmaster: Cléo Saulnier