SF Area/Download
Docs:
User
Internal/Devel
Support This Project
SourceForge.net Logo

csregex.h

Go to the documentation of this file.
00001 //---------------------------------------------------------------------------
00002 //
00003 // Author: Cleo Saulnier
00004 // Date: July 14, 2005
00005 //
00006 // Regular expression engine.
00007 //
00008 // See LICENSE file for legal information.
00009 // Basically public domain, but author keeps moral rights.
00010 // Original author's name above must remain in source.
00011 // Simply put, you can do what you want, but don't misrepresent
00012 // the origins, and contents, of this code.  
00013 //
00014 // http://sourceforge.net/projects/csregex/
00015 //
00016 //---------------------------------------------------------------------------
00017 
00018 #ifndef csregexH
00019 #define csregexH
00020 
00021 #include <vector>
00022 
00023 #ifdef _UNICODE
00024 #include <tchar.h>
00025 #endif
00026 
00027 using namespace std;
00028 
// Fallback character-type aliases for non-Unicode builds.
// When _UNICODE is defined, <tchar.h> is included earlier in this header
// instead, so nothing is defined here. When __TCHAR_DEFINED is already set
// (presumably by a previously included <tchar.h> -- confirm), the existing
// definitions are left alone.
00029 #ifndef _UNICODE
00030 #ifndef __TCHAR_DEFINED
// _TCHAR: plain character type used by the CSRegEx interface.
00031 #define _TCHAR char
// _TUCHAR: unsigned counterpart used for pattern/subject strings.
00032 #define _TUCHAR unsigned char
00033 #endif
00034 #endif
00035 
00041 // Internal compiled regular expression format.
00042 /*
00043 GRP chars ranges : Begin Group
00044 GRN chars ranges : Begin Group Negated
00045 
00046 DOT : match anything
00047 CHR char : match CHaR
00048 STR count char+ : match STRing
00049 BCK : match BaCKstring (0-9)
00050 
00051 OPA next : open parentheses (this OP has 10 values for each index).
00052 CPA : close parentheses
00053 OPN next : Open Parentheses, No backreference.
00054 CPN : Close Parentheses, No backreference.
00055 
00056 // Quantifiers (repetition operators)
00057 RPT prev start end : Repeat matching.
00058 STA prev : * 0-inf
00059 PLS prev : + 1-inf
00060 QES prev : ? 0-1
00061 RPT_LAZY prev start end : Repeat matching.
00062 STA_LAZY prev : * 0-inf
00063 PLS_LAZY prev : + 1-inf
00064 QES_LAZY prev : ? 0-1
00065 
00066 ORM next : or matching
00067 
00068 HED : match beginning of string.
00069 TAL : match TAiL (the terminating 0, i.e. end of string).
00070 
00071 END : end matching
00072 */
00073 
00074 
// Plain data record; CSRegEx keeps a vector of these in its `matches` member.
// NOTE(review): the original per-field Doxygen comments are not part of this
// listing. The field notes below are inferred from the names only and must
// be confirmed against the implementation.
00100 class CSMatch
00101 {
00102 public:
00103   unsigned int buf_idx; // presumably an index into the compiled regex buffer -- TODO confirm
00104   unsigned int start; // presumably where this match begins in the subject string -- TODO confirm
00105   unsigned int end; // presumably where this match ends in the subject string -- TODO confirm
00106   int rpt; // presumably a repeat counter for a quantified token -- TODO confirm
00107 };
00108 
// Plain data record; CSRegEx keeps a vector of these in its `par` member.
// NOTE(review): the original per-field Doxygen comments are not part of this
// listing. The field notes below are inferred from the names only and must
// be confirmed against the implementation.
00118 class CSPar
00119 {
00120 public:
00121   unsigned int buf_idx; // presumably the compiled-buffer index of a parenthesis token -- TODO confirm
00122   unsigned int match_idx; // presumably an index into CSRegEx's match list -- TODO confirm
00123 };
00124 
// Regular expression engine (declarations only; the member functions are
// defined outside this listing). A pattern is compiled into the internal
// token format described in the notes earlier in this file and then matched
// against subject strings.
// NOTE(review): this listing carries none of the original Doxygen member
// comments. Remarks marked "presumably" are inferred from names and
// signatures and should be confirmed against the implementation.
00240 class CSRegEx
00241 {
00242 private:
  // Token codes of the compiled format. The quoted mnemonics are the ones
  // used in the "Internal compiled regular expression format" notes.
00249   enum RGTok {
00250     rg_end = 0, // END: end matching
00251     rg_str = 1, // STR count char+: match string
00252     rg_grp = 2, // GRP chars ranges: begin group
00253     rg_grn = 3, // GRN chars ranges: begin group, negated
00254     rg_hed = 32, // HED: match beginning of string
00255     rg_tal = 33, // TAL: match tail
00256     rg_dot = 34, // DOT: match anything
00257     rg_cpn = 35, // CPN: close parentheses, no backreference
00258     rg_cpa = 36, // CPA: close parentheses; rg_cpa..rg_cpa9 give one code per index 0-9
00259     rg_cpa1 = 37,
00260     rg_cpa2 = 38,
00261     rg_cpa3 = 39,
00262     rg_cpa4 = 40,
00263     rg_cpa5 = 41,
00264     rg_cpa6 = 42,
00265     rg_cpa7 = 43,
00266     rg_cpa8 = 44,
00267     rg_cpa9 = 45,
00268     rg_bck = 46, // BCK: match backstring; rg_bck..rg_bck9 give one code per index 0-9
00269     rg_bck1 = 47,
00270     rg_bck2 = 48,
00271     rg_bck3 = 49,
00272     rg_bck4 = 50,
00273     rg_bck5 = 51,
00274     rg_bck6 = 52,
00275     rg_bck7 = 53,
00276     rg_bck8 = 54,
00277     rg_bck9 = 55,
00278     rg_chr = 64, // CHR char: match a single char
00279     rg_opa = 96, // OPA next: open parentheses; rg_opa..rg_opa9 give one code per index 0-9
00280     rg_opa1 = 97,
00281     rg_opa2 = 98,
00282     rg_opa3 = 99,
00283     rg_opa4 = 100,
00284     rg_opa5 = 101,
00285     rg_opa6 = 102,
00286     rg_opa7 = 103,
00287     rg_opa8 = 104,
00288     rg_opa9 = 105,
00289     rg_orm = 106, // ORM next: or matching
00290     rg_opn = 108, // OPN next: open parentheses, no backreference
00291     rg_sta = 109, // STA prev: * (0-inf)
00292     rg_pls = 110, // PLS prev: + (1-inf)
00293     rg_qes = 111, // QES prev: ? (0-1)
00294     rg_sta_lazy = 112, // STA_LAZY prev: lazy *
00295     rg_pls_lazy = 113, // PLS_LAZY prev: lazy +
00296     rg_qes_lazy = 114, // QES_LAZY prev: lazy ?
00297     rg_rpt = 224, // RPT prev start end: repeat matching
00298     rg_rpt_lazy = 225 // RPT_LAZY prev start end; no trailing comma (not valid before C++11)
00299   };
00300 
  // Internal state. std::vector is spelled out so the class does not depend
  // on the header-scope using-directive.
00301   const _TUCHAR *str; // presumably the pattern currently being compiled -- TODO confirm
00302   int str_len; // presumably the length of str -- TODO confirm
00303   _TUCHAR *buf; // presumably the compiled token buffer; raw pointer, ownership not visible here
00304   int buf_len; // presumably the number of units used in buf -- TODO confirm
00305   int buf_size; // presumably the allocated capacity of buf -- TODO confirm
00306   bool tal; // presumably a tail-anchor flag (cf. the TAL token) -- TODO confirm
00307   int opa_cnt; // presumably the number of OPA (capturing) groups seen -- TODO confirm
00308   std::vector<int> last_par; // NOTE(review): purpose not visible in this listing
00309   std::vector<CSMatch> matches; // matcher bookkeeping records (see CSMatch)
00310   std::vector<CSPar> par; // parenthesis bookkeeping records (see CSPar)
00311   std::vector<int> match_start[10]; // ten slots; presumably one per backreference index 0-9
00312   std::vector<int> match_end[10]; // ten slots; presumably one per backreference index 0-9
  // Private helpers. NOTE(review): their behavior is defined elsewhere; only
  // the signatures are visible here.
00313   void Put(_TCHAR c); // presumably appends one unit to buf -- TODO confirm
00314 #ifndef _UNICODE
00315   void PutValue16(unsigned short c); // only declared in non-Unicode builds
00316 #endif
00317   int Compile(int pos);
00318   int Escape(int pos, _TUCHAR &c); // c is an in/out or out parameter (passed by reference)
00319   int CompileModifiers(int pos, int st_pos);
00320   bool RangeAddChar(std::vector<_TUCHAR> &chars, std::vector<_TUCHAR> &start,
00321     std::vector<_TUCHAR> &end, _TUCHAR c);
00322   bool RangeAddRange(std::vector<_TUCHAR> &chars, std::vector<_TUCHAR> &start,
00323     std::vector<_TUCHAR> &end, _TUCHAR cstart, _TUCHAR cend);
00324   void CompressRange(std::vector<_TUCHAR> &chars, std::vector<_TUCHAR> &start,
00325     std::vector<_TUCHAR> &end);
00326   int CompileRange(int pos);
00327   void CheckString(int str_idx);
00328   unsigned int Next(unsigned int pos) const;
00329   unsigned int GetRpt(unsigned int buf_idx, int &low, int &high, bool &lazy, unsigned int &prev) const; // low/high/lazy/prev are reference (output-capable) parameters
00330   bool Match1(const _TUCHAR *str);
00331 
00332 public:
  // Error codes stored in error_code. The names indicate the condition;
  // NOTE(review): exactly when each one is raised is not visible here.
00339   enum RGError {
00340     rge_ok=0,
00341     rge_too_many_refs=1,
00342     rge_missing_round_bracket=2,
00343     rge_overlapping_chars=3,
00344     rge_esc_eof=4,
00345     rge_missing_square_bracket=5,
00346     rge_invalid_esc_hex=6,
00347     rge_invalid_repeat_format=7,
00348     rge_invalid_repeat_range=8,
00349     rge_unbalanced_round_bracket=9,
00350     rge_invalid_range=10,
00351     rge_invalid_backreference=11,
00352     rge_regex_too_long=12
00353   };
00354 
00355   int error; // NOTE(review): meaning not visible here (possibly an error position) -- confirm
00356   enum RGError error_code; // one of the RGError values above
00357   _TCHAR *error_str; // presumably a human-readable error message -- TODO confirm
00358   bool bMatchHead; // presumably a head-anchor flag (cf. the HED token) -- TODO confirm
00359   int MatchStart; // presumably the start of the overall match -- TODO confirm
00360   int MatchEnd; // presumably the end of the overall match -- TODO confirm
00361   int BackStart[10]; // ten slots; presumably backreference starts for indices 0-9
00362   int BackEnd[10]; // ten slots; presumably backreference ends for indices 0-9
00363 
00364   // There's no copy constructor, so don't use it.
  // (No copy assignment is declared either, so the compiler-generated
  // memberwise versions would copy the raw `buf` pointer as-is.)
  // The constructor and destructor are declared without the redundant
  // "CSRegEx::" qualifier: extra qualification inside the class body is
  // ill-formed in standard C++ and rejected by GCC.
00365   CSRegEx();
00366   ~CSRegEx();
  // The overloads inside "#ifndef _UNICODE" exist only in non-Unicode builds.
  // They accept plain char strings, which are a distinct type from the
  // unsigned-char _TUCHAR used by the primary signatures in those builds.
00367   bool Compile(const _TUCHAR *str);
00368 #ifndef _UNICODE
00369   bool Compile(const char *str);
00370 #endif
00371   bool MatchRE(const _TUCHAR *str, const _TUCHAR *re);
00372 #ifndef _UNICODE
00373   bool MatchRE(const char *str, const char *re);
00374   bool MatchRE(const unsigned char *str, const char *re);
00375   bool MatchRE(const char *str, const unsigned char *re);
00376 #endif
00377   bool Match(const _TUCHAR *str);
00378 #ifndef _UNICODE
00379   bool Match(const char *str);
00380 #endif
00381   bool Match(const _TUCHAR *str, const _TUCHAR *cmp);
00382 #ifndef _UNICODE
00383   bool Match(const char *str, const char *cmp);
00384   bool Match(const unsigned char *str, const char *cmp);
00385   bool Match(const char *str, const unsigned char *cmp);
00386 #endif
00387   _TUCHAR* GetCompiledString() const; // NOTE(review): ownership of the returned pointer is not visible here
00388   void SetCompiledString(_TUCHAR *str); // NOTE(review): whether str is copied or adopted is not visible here
00389 };
00390 #endif

Docs for CSRegEx created on Tue Dec 11 14:36:53 2007 by Doxygen 1.4.3


Webmaster: Cléo Saulnier