tinyxmlparser.cpp

00001 /*
00002 www.sourceforge.net/projects/tinyxml
00003 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
00004 
00005 This software is provided 'as-is', without any express or implied 
00006 warranty. In no event will the authors be held liable for any 
00007 damages arising from the use of this software.
00008 
00009 Permission is granted to anyone to use this software for any 
00010 purpose, including commercial applications, and to alter it and 
00011 redistribute it freely, subject to the following restrictions:
00012 
00013 1. The origin of this software must not be misrepresented; you must 
00014 not claim that you wrote the original software. If you use this
00015 software in a product, an acknowledgment in the product documentation
00016 would be appreciated but is not required.
00017 
00018 2. Altered source versions must be plainly marked as such, and 
00019 must not be misrepresented as being the original software.
00020 
00021 3. This notice may not be removed or altered from any source 
00022 distribution.
00023 */
00024 
00025 #include <ctype.h>
00026 #include <stddef.h>
00027 
00028 #include "tinyxml.h"
00029 
00030 //#define DEBUG_PARSER
00031 #if defined( DEBUG_PARSER )
00032 #   if defined( DEBUG ) && defined( _MSC_VER )
00033 #       include <windows.h>
00034 #       define TIXML_LOG OutputDebugString
00035 #   else
00036 #       define TIXML_LOG printf
00037 #   endif
00038 #endif
00039 
00040 // Note tha "PutString" hardcodes the same list. This
00041 // is less flexible than it appears. Changing the entries
00042 // or order will break putstring.   
00043 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 
00044 {
00045     { "&amp;",  5, '&' },
00046     { "&lt;",   4, '<' },
00047     { "&gt;",   4, '>' },
00048     { "&quot;", 6, '\"' },
00049     { "&apos;", 6, '\'' }
00050 };
00051 
// Background on UTF-8 and byte order marks:
//      http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the utf8ByteTable below, which gives the
// number of bytes in a sequence from its lead byte. Invalid lead bytes map
// to 1 -- the result will be junk, but it is passed through as far as
// possible.
// Beware of these three-byte sequences, which get special treatment here:
//              ef bb bf (Microsoft "lead bytes")
//              ef bf be
//              ef bf bf

// The three bytes of the Microsoft UTF-8 lead sequence, in order.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFu;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBu;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFu;
00065 
00066 const int TiXmlBase::utf8ByteTable[256] = 
00067 {
00068     //  0   1   2   3   4   5   6   7   8   9   a   b   c   d   e   f
00069         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x00
00070         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x10
00071         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x20
00072         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x30
00073         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x40
00074         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x50
00075         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x60
00076         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x70 End of ASCII range
00077         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x80 0x80 to 0xc1 invalid
00078         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0x90 
00079         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0xa0 
00080         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  // 0xb0 
00081         1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  // 0xc0 0xc2 to 0xdf 2 byte
00082         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  // 0xd0
00083         3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  // 0xe0 0xe0 to 0xef 3 byte
00084         4,  4,  4,  4,  4,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1   // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
00085 };
00086 
00087 
00088 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
00089 {
00090     const unsigned long BYTE_MASK = 0xBF;
00091     const unsigned long BYTE_MARK = 0x80;
00092     const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00093 
00094     if (input < 0x80) 
00095         *length = 1;
00096     else if ( input < 0x800 )
00097         *length = 2;
00098     else if ( input < 0x10000 )
00099         *length = 3;
00100     else if ( input < 0x200000 )
00101         *length = 4;
00102     else
00103         { *length = 0; return; }    // This code won't covert this correctly anyway.
00104 
00105     output += *length;
00106 
00107     // Scary scary fall throughs.
00108     switch (*length) 
00109     {
00110         case 4:
00111             --output; 
00112             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00113             input >>= 6;
00114         case 3:
00115             --output; 
00116             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00117             input >>= 6;
00118         case 2:
00119             --output; 
00120             *output = (char)((input | BYTE_MARK) & BYTE_MASK); 
00121             input >>= 6;
00122         case 1:
00123             --output; 
00124             *output = (char)(input | FIRST_BYTE_MARK[*length]);
00125     }
00126 }
00127 
00128 
00129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00130 {
00131     // This will only work for low-ascii, everything else is assumed to be a valid
00132     // letter. I'm not sure this is the best approach, but it is quite tricky trying
00133     // to figure out alhabetical vs. not across encoding. So take a very 
00134     // conservative approach.
00135 
00136 //  if ( encoding == TIXML_ENCODING_UTF8 )
00137 //  {
00138         if ( anyByte < 127 )
00139             return isalpha( anyByte );
00140         else
00141             return 1;   // What else to do? The unicode set is huge...get the english ones right.
00142 //  }
00143 //  else
00144 //  {
00145 //      return isalpha( anyByte );
00146 //  }
00147 }
00148 
00149 
00150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
00151 {
00152     // This will only work for low-ascii, everything else is assumed to be a valid
00153     // letter. I'm not sure this is the best approach, but it is quite tricky trying
00154     // to figure out alhabetical vs. not across encoding. So take a very 
00155     // conservative approach.
00156 
00157 //  if ( encoding == TIXML_ENCODING_UTF8 )
00158 //  {
00159         if ( anyByte < 127 )
00160             return isalnum( anyByte );
00161         else
00162             return 1;   // What else to do? The unicode set is huge...get the english ones right.
00163 //  }
00164 //  else
00165 //  {
00166 //      return isalnum( anyByte );
00167 //  }
00168 }
00169 
00170 
00171 class TiXmlParsingData
00172 {
00173     friend class TiXmlDocument;
00174   public:
00175     void Stamp( const char* now, TiXmlEncoding encoding );
00176 
00177     const TiXmlCursor& Cursor() { return cursor; }
00178 
00179   private:
00180     // Only used by the document!
00181     TiXmlParsingData( const char* start, int _tabsize, int row, int col )
00182     {
00183         assert( start );
00184         stamp = start;
00185         tabsize = _tabsize;
00186         cursor.row = row;
00187         cursor.col = col;
00188     }
00189 
00190     TiXmlCursor     cursor;
00191     const char*     stamp;
00192     int             tabsize;
00193 };
00194 
00195 
00196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
00197 {
00198     assert( now );
00199 
00200     // Do nothing if the tabsize is 0.
00201     if ( tabsize < 1 )
00202     {
00203         return;
00204     }
00205 
00206     // Get the current row, column.
00207     int row = cursor.row;
00208     int col = cursor.col;
00209     const char* p = stamp;
00210     assert( p );
00211 
00212     while ( p < now )
00213     {
00214         // Treat p as unsigned, so we have a happy compiler.
00215         const unsigned char* pU = (const unsigned char*)p;
00216 
00217         // Code contributed by Fletcher Dunn: (modified by lee)
00218         switch (*pU) {
00219             case 0:
00220                 // We *should* never get here, but in case we do, don't
00221                 // advance past the terminating null character, ever
00222                 return;
00223 
00224             case '\r':
00225                 // bump down to the next line
00226                 ++row;
00227                 col = 0;                
00228                 // Eat the character
00229                 ++p;
00230 
00231                 // Check for \r\n sequence, and treat this as a single character
00232                 if (*p == '\n') {
00233                     ++p;
00234                 }
00235                 break;
00236 
00237             case '\n':
00238                 // bump down to the next line
00239                 ++row;
00240                 col = 0;
00241 
00242                 // Eat the character
00243                 ++p;
00244 
00245                 // Check for \n\r sequence, and treat this as a single
00246                 // character.  (Yes, this bizarre thing does occur still
00247                 // on some arcane platforms...)
00248                 if (*p == '\r') {
00249                     ++p;
00250                 }
00251                 break;
00252 
00253             case '\t':
00254                 // Eat the character
00255                 ++p;
00256 
00257                 // Skip to next tab stop
00258                 col = (col / tabsize + 1) * tabsize;
00259                 break;
00260 
00261             case TIXML_UTF_LEAD_0:
00262                 if ( encoding == TIXML_ENCODING_UTF8 )
00263                 {
00264                     if ( *(p+1) && *(p+2) )
00265                     {
00266                         // In these cases, don't advance the column. These are
00267                         // 0-width spaces.
00268                         if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
00269                             p += 3; 
00270                         else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
00271                             p += 3; 
00272                         else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
00273                             p += 3; 
00274                         else
00275                             { p +=3; ++col; }   // A normal character.
00276                     }
00277                 }
00278                 else
00279                 {
00280                     ++p;
00281                     ++col;
00282                 }
00283                 break;
00284 
00285             default:
00286                 if ( encoding == TIXML_ENCODING_UTF8 )
00287                 {
00288                     // Eat the 1 to 4 byte utf8 character.
00289                     int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
00290                     if ( step == 0 )
00291                         step = 1;       // Error case from bad encoding, but handle gracefully.
00292                     p += step;
00293 
00294                     // Just advance one column, of course.
00295                     ++col;
00296                 }
00297                 else
00298                 {
00299                     ++p;
00300                     ++col;
00301                 }
00302                 break;
00303         }
00304     }
00305     cursor.row = row;
00306     cursor.col = col;
00307     assert( cursor.row >= -1 );
00308     assert( cursor.col >= -1 );
00309     stamp = p;
00310     assert( stamp );
00311 }
00312 
00313 
00314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
00315 {
00316     if ( !p || !*p )
00317     {
00318         return 0;
00319     }
00320     if ( encoding == TIXML_ENCODING_UTF8 )
00321     {
00322         while ( *p )
00323         {
00324             const unsigned char* pU = (const unsigned char*)p;
00325             
00326             // Skip the stupid Microsoft UTF-8 Byte order marks
00327             if (    *(pU+0)==TIXML_UTF_LEAD_0
00328                  && *(pU+1)==TIXML_UTF_LEAD_1 
00329                  && *(pU+2)==TIXML_UTF_LEAD_2 )
00330             {
00331                 p += 3;
00332                 continue;
00333             }
00334             else if(*(pU+0)==TIXML_UTF_LEAD_0
00335                  && *(pU+1)==0xbfU
00336                  && *(pU+2)==0xbeU )
00337             {
00338                 p += 3;
00339                 continue;
00340             }
00341             else if(*(pU+0)==TIXML_UTF_LEAD_0
00342                  && *(pU+1)==0xbfU
00343                  && *(pU+2)==0xbfU )
00344             {
00345                 p += 3;
00346                 continue;
00347             }
00348 
00349             if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )        // Still using old rules for white space.
00350                 ++p;
00351             else
00352                 break;
00353         }
00354     }
00355     else
00356     {
00357         while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
00358             ++p;
00359     }
00360 
00361     return p;
00362 }
00363 
00364 #ifdef TIXML_USE_STL
00365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
00366 {
00367     for( ;; )
00368     {
00369         if ( !in->good() ) return false;
00370 
00371         int c = in->peek();
00372         // At this scope, we can't get to a document. So fail silently.
00373         if ( !IsWhiteSpace( c ) || c <= 0 )
00374             return true;
00375 
00376         *tag += (char) in->get();
00377     }
00378 }
00379 
00380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
00381 {
00382     //assert( character > 0 && character < 128 );   // else it won't work in utf-8
00383     while ( in->good() )
00384     {
00385         int c = in->peek();
00386         if ( c == character )
00387             return true;
00388         if ( c <= 0 )       // Silent failure: can't get document at this scope
00389             return false;
00390 
00391         in->get();
00392         *tag += (char) c;
00393     }
00394     return false;
00395 }
00396 #endif
00397 
00398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
00399 // "assign" optimization removes over 10% of the execution time.
00400 //
00401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
00402 {
00403     // Oddly, not supported on some comilers,
00404     //name->clear();
00405     // So use this:
00406     *name = "";
00407     assert( p );
00408 
00409     // Names start with letters or underscores.
00410     // Of course, in unicode, tinyxml has no idea what a letter *is*. The
00411     // algorithm is generous.
00412     //
00413     // After that, they can be letters, underscores, numbers,
00414     // hyphens, or colons. (Colons are valid ony for namespaces,
00415     // but tinyxml can't tell namespaces from names.)
00416     if (    p && *p 
00417          && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
00418     {
00419         const char* start = p;
00420         while(      p && *p
00421                 &&  (       IsAlphaNum( (unsigned char ) *p, encoding ) 
00422                          || *p == '_'
00423                          || *p == '-'
00424                          || *p == '.'
00425                          || *p == ':' ) )
00426         {
00427             //(*name) += *p; // expensive
00428             ++p;
00429         }
00430         if ( p-start > 0 ) {
00431             name->assign( start, p-start );
00432         }
00433         return p;
00434     }
00435     return 0;
00436 }
00437 
00438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
00439 {
00440     // Presume an entity, and pull it out.
00441     TIXML_STRING ent;
00442     int i;
00443     *length = 0;
00444 
00445     if ( *(p+1) && *(p+1) == '#' && *(p+2) )
00446     {
00447         unsigned long ucs = 0;
00448         ptrdiff_t delta = 0;
00449         unsigned mult = 1;
00450 
00451         if ( *(p+2) == 'x' )
00452         {
00453             // Hexadecimal.
00454             if ( !*(p+3) ) return 0;
00455 
00456             const char* q = p+3;
00457             q = strchr( q, ';' );
00458 
00459             if ( !q || !*q ) return 0;
00460 
00461             delta = q-p;
00462             --q;
00463 
00464             while ( *q != 'x' )
00465             {
00466                 if ( *q >= '0' && *q <= '9' )
00467                     ucs += mult * (*q - '0');
00468                 else if ( *q >= 'a' && *q <= 'f' )
00469                     ucs += mult * (*q - 'a' + 10);
00470                 else if ( *q >= 'A' && *q <= 'F' )
00471                     ucs += mult * (*q - 'A' + 10 );
00472                 else 
00473                     return 0;
00474                 mult *= 16;
00475                 --q;
00476             }
00477         }
00478         else
00479         {
00480             // Decimal.
00481             if ( !*(p+2) ) return 0;
00482 
00483             const char* q = p+2;
00484             q = strchr( q, ';' );
00485 
00486             if ( !q || !*q ) return 0;
00487 
00488             delta = q-p;
00489             --q;
00490 
00491             while ( *q != '#' )
00492             {
00493                 if ( *q >= '0' && *q <= '9' )
00494                     ucs += mult * (*q - '0');
00495                 else 
00496                     return 0;
00497                 mult *= 10;
00498                 --q;
00499             }
00500         }
00501         if ( encoding == TIXML_ENCODING_UTF8 )
00502         {
00503             // convert the UCS to UTF-8
00504             ConvertUTF32ToUTF8( ucs, value, length );
00505         }
00506         else
00507         {
00508             *value = (char)ucs;
00509             *length = 1;
00510         }
00511         return p + delta + 1;
00512     }
00513 
00514     // Now try to match it.
00515     for( i=0; i<NUM_ENTITY; ++i )
00516     {
00517         if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
00518         {
00519             assert( strlen( entity[i].str ) == entity[i].strLength );
00520             *value = entity[i].chr;
00521             *length = 1;
00522             return ( p + entity[i].strLength );
00523         }
00524     }
00525 
00526     // So it wasn't an entity, its unrecognized, or something like that.
00527     *value = *p;    // Don't put back the last one, since we return it!
00528     //*length = 1;  // Leave unrecognized entities - this doesn't really work.
00529                     // Just writes strange XML.
00530     return p+1;
00531 }
00532 
00533 
00534 bool TiXmlBase::StringEqual( const char* p,
00535                              const char* tag,
00536                              bool ignoreCase,
00537                              TiXmlEncoding encoding )
00538 {
00539     assert( p );
00540     assert( tag );
00541     if ( !p || !*p )
00542     {
00543         assert( 0 );
00544         return false;
00545     }
00546 
00547     const char* q = p;
00548 
00549     if ( ignoreCase )
00550     {
00551         while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
00552         {
00553             ++q;
00554             ++tag;
00555         }
00556 
00557         if ( *tag == 0 )
00558             return true;
00559     }
00560     else
00561     {
00562         while ( *q && *tag && *q == *tag )
00563         {
00564             ++q;
00565             ++tag;
00566         }
00567 
00568         if ( *tag == 0 )        // Have we found the end of the tag, and everything equal?
00569             return true;
00570     }
00571     return false;
00572 }
00573 
00574 const char* TiXmlBase::ReadText(    const char* p, 
00575                                     TIXML_STRING * text, 
00576                                     bool trimWhiteSpace, 
00577                                     const char* endTag, 
00578                                     bool caseInsensitive,
00579                                     TiXmlEncoding encoding )
00580 {
00581     *text = "";
00582     if (    !trimWhiteSpace         // certain tags always keep whitespace
00583          || !condenseWhiteSpace )   // if true, whitespace is always kept
00584     {
00585         // Keep all the white space.
00586         while (    p && *p
00587                 && !StringEqual( p, endTag, caseInsensitive, encoding )
00588               )
00589         {
00590             int len;
00591             char cArr[4] = { 0, 0, 0, 0 };
00592             p = GetChar( p, cArr, &len, encoding );
00593             text->append( cArr, len );
00594         }
00595     }
00596     else
00597     {
00598         bool whitespace = false;
00599 
00600         // Remove leading white space:
00601         p = SkipWhiteSpace( p, encoding );
00602         while (    p && *p
00603                 && !StringEqual( p, endTag, caseInsensitive, encoding ) )
00604         {
00605             if ( *p == '\r' || *p == '\n' )
00606             {
00607                 whitespace = true;
00608                 ++p;
00609             }
00610             else if ( IsWhiteSpace( *p ) )
00611             {
00612                 whitespace = true;
00613                 ++p;
00614             }
00615             else
00616             {
00617                 // If we've found whitespace, add it before the
00618                 // new character. Any whitespace just becomes a space.
00619                 if ( whitespace )
00620                 {
00621                     (*text) += ' ';
00622                     whitespace = false;
00623                 }
00624                 int len;
00625                 char cArr[4] = { 0, 0, 0, 0 };
00626                 p = GetChar( p, cArr, &len, encoding );
00627                 if ( len == 1 )
00628                     (*text) += cArr[0]; // more efficient
00629                 else
00630                     text->append( cArr, len );
00631             }
00632         }
00633     }
00634     if ( p ) 
00635         p += strlen( endTag );
00636     return p;
00637 }
00638 
00639 #ifdef TIXML_USE_STL
00640 
00641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
00642 {
00643     // The basic issue with a document is that we don't know what we're
00644     // streaming. Read something presumed to be a tag (and hope), then
00645     // identify it, and call the appropriate stream method on the tag.
00646     //
00647     // This "pre-streaming" will never read the closing ">" so the
00648     // sub-tag can orient itself.
00649 
00650     if ( !StreamTo( in, '<', tag ) ) 
00651     {
00652         SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00653         return;
00654     }
00655 
00656     while ( in->good() )
00657     {
00658         int tagIndex = (int) tag->length();
00659         while ( in->good() && in->peek() != '>' )
00660         {
00661             int c = in->get();
00662             if ( c <= 0 )
00663             {
00664                 SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00665                 break;
00666             }
00667             (*tag) += (char) c;
00668         }
00669 
00670         if ( in->good() )
00671         {
00672             // We now have something we presume to be a node of 
00673             // some sort. Identify it, and call the node to
00674             // continue streaming.
00675             TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
00676 
00677             if ( node )
00678             {
00679                 node->StreamIn( in, tag );
00680                 bool isElement = node->ToElement() != 0;
00681                 delete node;
00682                 node = 0;
00683 
00684                 // If this is the root element, we're done. Parsing will be
00685                 // done by the >> operator.
00686                 if ( isElement )
00687                 {
00688                     return;
00689                 }
00690             }
00691             else
00692             {
00693                 SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00694                 return;
00695             }
00696         }
00697     }
00698     // We should have returned sooner.
00699     SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
00700 }
00701 
00702 #endif
00703 
00704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
00705 {
00706     ClearError();
00707 
00708     // Parse away, at the document level. Since a document
00709     // contains nothing but other tags, most of what happens
00710     // here is skipping white space.
00711     if ( !p || !*p )
00712     {
00713         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00714         return 0;
00715     }
00716 
00717     // Note that, for a document, this needs to come
00718     // before the while space skip, so that parsing
00719     // starts from the pointer we are given.
00720     location.Clear();
00721     if ( prevData )
00722     {
00723         location.row = prevData->cursor.row;
00724         location.col = prevData->cursor.col;
00725     }
00726     else
00727     {
00728         location.row = 0;
00729         location.col = 0;
00730     }
00731     TiXmlParsingData data( p, TabSize(), location.row, location.col );
00732     location = data.Cursor();
00733 
00734     if ( encoding == TIXML_ENCODING_UNKNOWN )
00735     {
00736         // Check for the Microsoft UTF-8 lead bytes.
00737         const unsigned char* pU = (const unsigned char*)p;
00738         if (    *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
00739              && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
00740              && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
00741         {
00742             encoding = TIXML_ENCODING_UTF8;
00743             useMicrosoftBOM = true;
00744         }
00745     }
00746 
00747     p = SkipWhiteSpace( p, encoding );
00748     if ( !p )
00749     {
00750         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
00751         return 0;
00752     }
00753 
00754     while ( p && *p )
00755     {
00756         TiXmlNode* node = Identify( p, encoding );
00757         if ( node )
00758         {
00759             p = node->Parse( p, &data, encoding );
00760             LinkEndChild( node );
00761         }
00762         else
00763         {
00764             break;
00765         }
00766 
00767         // Did we get encoding info?
00768         if (    encoding == TIXML_ENCODING_UNKNOWN
00769              && node->ToDeclaration() )
00770         {
00771             TiXmlDeclaration* dec = node->ToDeclaration();
00772             const char* enc = dec->Encoding();
00773             assert( enc );
00774 
00775             if ( *enc == 0 )
00776                 encoding = TIXML_ENCODING_UTF8;
00777             else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
00778                 encoding = TIXML_ENCODING_UTF8;
00779             else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
00780                 encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
00781             else 
00782                 encoding = TIXML_ENCODING_LEGACY;
00783         }
00784 
00785         p = SkipWhiteSpace( p, encoding );
00786     }
00787 
00788     // Was this empty?
00789     if ( !firstChild ) {
00790         SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
00791         return 0;
00792     }
00793 
00794     // All is well.
00795     return p;
00796 }
00797 
00798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
00799 {   
00800     // The first error in a chain is more accurate - don't set again!
00801     if ( error )
00802         return;
00803 
00804     assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
00805     error   = true;
00806     errorId = err;
00807     errorDesc = errorString[ errorId ];
00808 
00809     errorLocation.Clear();
00810     if ( pError && data )
00811     {
00812         data->Stamp( pError, encoding );
00813         errorLocation = data->Cursor();
00814     }
00815 }
00816 
00817 
00818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
00819 {
00820     TiXmlNode* returnNode = 0;
00821 
00822     p = SkipWhiteSpace( p, encoding );
00823     if( !p || !*p || *p != '<' )
00824     {
00825         return 0;
00826     }
00827 
00828     TiXmlDocument* doc = GetDocument();
00829     p = SkipWhiteSpace( p, encoding );
00830 
00831     if ( !p || !*p )
00832     {
00833         return 0;
00834     }
00835 
00836     // What is this thing? 
00837     // - Elements start with a letter or underscore, but xml is reserved.
00838     // - Comments: <!--
00839     // - Decleration: <?xml
00840     // - Everthing else is unknown to tinyxml.
00841     //
00842 
00843     const char* xmlHeader = { "<?xml" };
00844     const char* commentHeader = { "<!--" };
00845     const char* dtdHeader = { "<!" };
00846     const char* cdataHeader = { "<![CDATA[" };
00847 
00848     if ( StringEqual( p, xmlHeader, true, encoding ) )
00849     {
00850         #ifdef DEBUG_PARSER
00851             TIXML_LOG( "XML parsing Declaration\n" );
00852         #endif
00853         returnNode = new TiXmlDeclaration();
00854     }
00855     else if ( StringEqual( p, commentHeader, false, encoding ) )
00856     {
00857         #ifdef DEBUG_PARSER
00858             TIXML_LOG( "XML parsing Comment\n" );
00859         #endif
00860         returnNode = new TiXmlComment();
00861     }
00862     else if ( StringEqual( p, cdataHeader, false, encoding ) )
00863     {
00864         #ifdef DEBUG_PARSER
00865             TIXML_LOG( "XML parsing CDATA\n" );
00866         #endif
00867         TiXmlText* text = new TiXmlText( "" );
00868         text->SetCDATA( true );
00869         returnNode = text;
00870     }
00871     else if ( StringEqual( p, dtdHeader, false, encoding ) )
00872     {
00873         #ifdef DEBUG_PARSER
00874             TIXML_LOG( "XML parsing Unknown(1)\n" );
00875         #endif
00876         returnNode = new TiXmlUnknown();
00877     }
00878     else if (    IsAlpha( *(p+1), encoding )
00879               || *(p+1) == '_' )
00880     {
00881         #ifdef DEBUG_PARSER
00882             TIXML_LOG( "XML parsing Element\n" );
00883         #endif
00884         returnNode = new TiXmlElement( "" );
00885     }
00886     else
00887     {
00888         #ifdef DEBUG_PARSER
00889             TIXML_LOG( "XML parsing Unknown(2)\n" );
00890         #endif
00891         returnNode = new TiXmlUnknown();
00892     }
00893 
00894     if ( returnNode )
00895     {
00896         // Set the parent, so it can report errors
00897         returnNode->parent = this;
00898     }
00899     else
00900     {
00901         if ( doc )
00902             doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
00903     }
00904     return returnNode;
00905 }
00906 
00907 #ifdef TIXML_USE_STL
00908 
00909 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
00910 {
00911     // We're called with some amount of pre-parsing. That is, some of "this"
00912     // element is in "tag". Go ahead and stream to the closing ">"
00913     while( in->good() )
00914     {
00915         int c = in->get();
00916         if ( c <= 0 )
00917         {
00918             TiXmlDocument* document = GetDocument();
00919             if ( document )
00920                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00921             return;
00922         }
00923         (*tag) += (char) c ;
00924         
00925         if ( c == '>' )
00926             break;
00927     }
00928 
00929     if ( tag->length() < 3 ) return;
00930 
00931     // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
00932     // If not, identify and stream.
00933 
00934     if (    tag->at( tag->length() - 1 ) == '>' 
00935          && tag->at( tag->length() - 2 ) == '/' )
00936     {
00937         // All good!
00938         return;
00939     }
00940     else if ( tag->at( tag->length() - 1 ) == '>' )
00941     {
00942         // There is more. Could be:
00943         //      text
00944         //      cdata text (which looks like another node)
00945         //      closing tag
00946         //      another node.
00947         for ( ;; )
00948         {
00949             StreamWhiteSpace( in, tag );
00950 
00951             // Do we have text?
00952             if ( in->good() && in->peek() != '<' ) 
00953             {
00954                 // Yep, text.
00955                 TiXmlText text( "" );
00956                 text.StreamIn( in, tag );
00957 
00958                 // What follows text is a closing tag or another node.
00959                 // Go around again and figure it out.
00960                 continue;
00961             }
00962 
00963             // We now have either a closing tag...or another node.
00964             // We should be at a "<", regardless.
00965             if ( !in->good() ) return;
00966             assert( in->peek() == '<' );
00967             int tagIndex = (int) tag->length();
00968 
00969             bool closingTag = false;
00970             bool firstCharFound = false;
00971 
00972             for( ;; )
00973             {
00974                 if ( !in->good() )
00975                     return;
00976 
00977                 int c = in->peek();
00978                 if ( c <= 0 )
00979                 {
00980                     TiXmlDocument* document = GetDocument();
00981                     if ( document )
00982                         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
00983                     return;
00984                 }
00985                 
00986                 if ( c == '>' )
00987                     break;
00988 
00989                 *tag += (char) c;
00990                 in->get();
00991 
00992                 // Early out if we find the CDATA id.
00993                 if ( c == '[' && tag->size() >= 9 )
00994                 {
00995                     size_t len = tag->size();
00996                     const char* start = tag->c_str() + len - 9;
00997                     if ( strcmp( start, "<![CDATA[" ) == 0 ) {
00998                         assert( !closingTag );
00999                         break;
01000                     }
01001                 }
01002 
01003                 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
01004                 {
01005                     firstCharFound = true;
01006                     if ( c == '/' )
01007                         closingTag = true;
01008                 }
01009             }
01010             // If it was a closing tag, then read in the closing '>' to clean up the input stream.
01011             // If it was not, the streaming will be done by the tag.
01012             if ( closingTag )
01013             {
01014                 if ( !in->good() )
01015                     return;
01016 
01017                 int c = in->get();
01018                 if ( c <= 0 )
01019                 {
01020                     TiXmlDocument* document = GetDocument();
01021                     if ( document )
01022                         document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01023                     return;
01024                 }
01025                 assert( c == '>' );
01026                 *tag += (char) c;
01027 
01028                 // We are done, once we've found our closing tag.
01029                 return;
01030             }
01031             else
01032             {
01033                 // If not a closing tag, id it, and stream.
01034                 const char* tagloc = tag->c_str() + tagIndex;
01035                 TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
01036                 if ( !node )
01037                     return;
01038                 node->StreamIn( in, tag );
01039                 delete node;
01040                 node = 0;
01041 
01042                 // No return: go around from the beginning: text, closing tag, or node.
01043             }
01044         }
01045     }
01046 }
01047 #endif
01048 
01049 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01050 {
01051     p = SkipWhiteSpace( p, encoding );
01052     TiXmlDocument* document = GetDocument();
01053 
01054     if ( !p || !*p )
01055     {
01056         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
01057         return 0;
01058     }
01059 
01060     if ( data )
01061     {
01062         data->Stamp( p, encoding );
01063         location = data->Cursor();
01064     }
01065 
01066     if ( *p != '<' )
01067     {
01068         if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
01069         return 0;
01070     }
01071 
01072     p = SkipWhiteSpace( p+1, encoding );
01073 
01074     // Read the name.
01075     const char* pErr = p;
01076 
01077     p = ReadName( p, &value, encoding );
01078     if ( !p || !*p )
01079     {
01080         if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
01081         return 0;
01082     }
01083 
01084     TIXML_STRING endTag ("</");
01085     endTag += value;
01086     endTag += ">";
01087 
01088     // Check for and read attributes. Also look for an empty
01089     // tag or an end tag.
01090     while ( p && *p )
01091     {
01092         pErr = p;
01093         p = SkipWhiteSpace( p, encoding );
01094         if ( !p || !*p )
01095         {
01096             if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01097             return 0;
01098         }
01099         if ( *p == '/' )
01100         {
01101             ++p;
01102             // Empty tag.
01103             if ( *p  != '>' )
01104             {
01105                 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );     
01106                 return 0;
01107             }
01108             return (p+1);
01109         }
01110         else if ( *p == '>' )
01111         {
01112             // Done with attributes (if there were any.)
01113             // Read the value -- which can include other
01114             // elements -- read the end tag, and return.
01115             ++p;
01116             p = ReadValue( p, data, encoding );     // Note this is an Element method, and will set the error if one happens.
01117             if ( !p || !*p ) {
01118                 // We were looking for the end tag, but found nothing.
01119                 // Fix for [ 1663758 ] Failure to report error on bad XML
01120                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01121                 return 0;
01122             }
01123 
01124             // We should find the end tag now
01125             if ( StringEqual( p, endTag.c_str(), false, encoding ) )
01126             {
01127                 p += endTag.length();
01128                 return p;
01129             }
01130             else
01131             {
01132                 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
01133                 return 0;
01134             }
01135         }
01136         else
01137         {
01138             // Try to read an attribute:
01139             TiXmlAttribute* attrib = new TiXmlAttribute();
01140             if ( !attrib )
01141             {
01142                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
01143                 return 0;
01144             }
01145 
01146             attrib->SetDocument( document );
01147             pErr = p;
01148             p = attrib->Parse( p, data, encoding );
01149 
01150             if ( !p || !*p )
01151             {
01152                 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
01153                 delete attrib;
01154                 return 0;
01155             }
01156 
01157             // Handle the strange case of double attributes:
01158             #ifdef TIXML_USE_STL
01159             TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
01160             #else
01161             TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
01162             #endif
01163             if ( node )
01164             {
01165                 node->SetValue( attrib->Value() );
01166                 delete attrib;
01167                 return 0;
01168             }
01169 
01170             attributeSet.Add( attrib );
01171         }
01172     }
01173     return p;
01174 }
01175 
01176 
01177 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01178 {
01179     TiXmlDocument* document = GetDocument();
01180 
01181     // Read in text and elements in any order.
01182     const char* pWithWhiteSpace = p;
01183     p = SkipWhiteSpace( p, encoding );
01184 
01185     while ( p && *p )
01186     {
01187         if ( *p != '<' )
01188         {
01189             // Take what we have, make a text element.
01190             TiXmlText* textNode = new TiXmlText( "" );
01191 
01192             if ( !textNode )
01193             {
01194                 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
01195                     return 0;
01196             }
01197 
01198             if ( TiXmlBase::IsWhiteSpaceCondensed() )
01199             {
01200                 p = textNode->Parse( p, data, encoding );
01201             }
01202             else
01203             {
01204                 // Special case: we want to keep the white space
01205                 // so that leading spaces aren't removed.
01206                 p = textNode->Parse( pWithWhiteSpace, data, encoding );
01207             }
01208 
01209             if ( !textNode->Blank() )
01210                 LinkEndChild( textNode );
01211             else
01212                 delete textNode;
01213         } 
01214         else 
01215         {
01216             // We hit a '<'
01217             // Have we hit a new element or an end tag? This could also be
01218             // a TiXmlText in the "CDATA" style.
01219             if ( StringEqual( p, "</", false, encoding ) )
01220             {
01221                 return p;
01222             }
01223             else
01224             {
01225                 TiXmlNode* node = Identify( p, encoding );
01226                 if ( node )
01227                 {
01228                     p = node->Parse( p, data, encoding );
01229                     LinkEndChild( node );
01230                 }               
01231                 else
01232                 {
01233                     return 0;
01234                 }
01235             }
01236         }
01237         pWithWhiteSpace = p;
01238         p = SkipWhiteSpace( p, encoding );
01239     }
01240 
01241     if ( !p )
01242     {
01243         if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
01244     }   
01245     return p;
01246 }
01247 
01248 
01249 #ifdef TIXML_USE_STL
01250 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
01251 {
01252     while ( in->good() )
01253     {
01254         int c = in->get();  
01255         if ( c <= 0 )
01256         {
01257             TiXmlDocument* document = GetDocument();
01258             if ( document )
01259                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01260             return;
01261         }
01262         (*tag) += (char) c;
01263 
01264         if ( c == '>' )
01265         {
01266             // All is well.
01267             return;     
01268         }
01269     }
01270 }
01271 #endif
01272 
01273 
01274 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01275 {
01276     TiXmlDocument* document = GetDocument();
01277     p = SkipWhiteSpace( p, encoding );
01278 
01279     if ( data )
01280     {
01281         data->Stamp( p, encoding );
01282         location = data->Cursor();
01283     }
01284     if ( !p || !*p || *p != '<' )
01285     {
01286         if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
01287         return 0;
01288     }
01289     ++p;
01290     value = "";
01291 
01292     while ( p && *p && *p != '>' )
01293     {
01294         value += *p;
01295         ++p;
01296     }
01297 
01298     if ( !p )
01299     {
01300         if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
01301     }
01302     if ( *p == '>' )
01303         return p+1;
01304     return p;
01305 }
01306 
01307 #ifdef TIXML_USE_STL
01308 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
01309 {
01310     while ( in->good() )
01311     {
01312         int c = in->get();  
01313         if ( c <= 0 )
01314         {
01315             TiXmlDocument* document = GetDocument();
01316             if ( document )
01317                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01318             return;
01319         }
01320 
01321         (*tag) += (char) c;
01322 
01323         if ( c == '>' 
01324              && tag->at( tag->length() - 2 ) == '-'
01325              && tag->at( tag->length() - 3 ) == '-' )
01326         {
01327             // All is well.
01328             return;     
01329         }
01330     }
01331 }
01332 #endif
01333 
01334 
01335 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01336 {
01337     TiXmlDocument* document = GetDocument();
01338     value = "";
01339 
01340     p = SkipWhiteSpace( p, encoding );
01341 
01342     if ( data )
01343     {
01344         data->Stamp( p, encoding );
01345         location = data->Cursor();
01346     }
01347     const char* startTag = "<!--";
01348     const char* endTag   = "-->";
01349 
01350     if ( !StringEqual( p, startTag, false, encoding ) )
01351     {
01352         document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
01353         return 0;
01354     }
01355     p += strlen( startTag );
01356 
01357     // [ 1475201 ] TinyXML parses entities in comments
01358     // Oops - ReadText doesn't work, because we don't want to parse the entities.
01359     // p = ReadText( p, &value, false, endTag, false, encoding );
01360     //
01361     // from the XML spec:
01362     /*
01363      [Definition: Comments may appear anywhere in a document outside other markup; in addition, 
01364                   they may appear within the document type declaration at places allowed by the grammar. 
01365                   They are not part of the document's character data; an XML processor MAY, but need not, 
01366                   make it possible for an application to retrieve the text of comments. For compatibility, 
01367                   the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity 
01368                   references MUST NOT be recognized within comments.
01369 
01370                   An example of a comment:
01371 
01372                   <!-- declarations for <head> & <body> -->
01373     */
01374 
01375     value = "";
01376     // Keep all the white space.
01377     while ( p && *p && !StringEqual( p, endTag, false, encoding ) )
01378     {
01379         value.append( p, 1 );
01380         ++p;
01381     }
01382     if ( p ) 
01383         p += strlen( endTag );
01384 
01385     return p;
01386 }
01387 
01388 
01389 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01390 {
01391     p = SkipWhiteSpace( p, encoding );
01392     if ( !p || !*p ) return 0;
01393 
01394 //  int tabsize = 4;
01395 //  if ( document )
01396 //      tabsize = document->TabSize();
01397 
01398     if ( data )
01399     {
01400         data->Stamp( p, encoding );
01401         location = data->Cursor();
01402     }
01403     // Read the name, the '=' and the value.
01404     const char* pErr = p;
01405     p = ReadName( p, &name, encoding );
01406     if ( !p || !*p )
01407     {
01408         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
01409         return 0;
01410     }
01411     p = SkipWhiteSpace( p, encoding );
01412     if ( !p || !*p || *p != '=' )
01413     {
01414         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01415         return 0;
01416     }
01417 
01418     ++p;    // skip '='
01419     p = SkipWhiteSpace( p, encoding );
01420     if ( !p || !*p )
01421     {
01422         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01423         return 0;
01424     }
01425     
01426     const char* end;
01427     const char SINGLE_QUOTE = '\'';
01428     const char DOUBLE_QUOTE = '\"';
01429 
01430     if ( *p == SINGLE_QUOTE )
01431     {
01432         ++p;
01433         end = "\'";     // single quote in string
01434         p = ReadText( p, &value, false, end, false, encoding );
01435     }
01436     else if ( *p == DOUBLE_QUOTE )
01437     {
01438         ++p;
01439         end = "\"";     // double quote in string
01440         p = ReadText( p, &value, false, end, false, encoding );
01441     }
01442     else
01443     {
01444         // All attribute values should be in single or double quotes.
01445         // But this is such a common error that the parser will try
01446         // its best, even without them.
01447         value = "";
01448         while (    p && *p                                          // existence
01449                 && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'  // whitespace
01450                 && *p != '/' && *p != '>' )                         // tag end
01451         {
01452             if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
01453                 // [ 1451649 ] Attribute values with trailing quotes not handled correctly
01454                 // We did not have an opening quote but seem to have a 
01455                 // closing one. Give up and throw an error.
01456                 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
01457                 return 0;
01458             }
01459             value += *p;
01460             ++p;
01461         }
01462     }
01463     return p;
01464 }
01465 
01466 #ifdef TIXML_USE_STL
01467 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
01468 {
01469     while ( in->good() )
01470     {
01471         int c = in->peek(); 
01472         if ( !cdata && (c == '<' ) ) 
01473         {
01474             return;
01475         }
01476         if ( c <= 0 )
01477         {
01478             TiXmlDocument* document = GetDocument();
01479             if ( document )
01480                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01481             return;
01482         }
01483 
01484         (*tag) += (char) c;
01485         in->get();  // "commits" the peek made above
01486 
01487         if ( cdata && c == '>' && tag->size() >= 3 ) {
01488             size_t len = tag->size();
01489             if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
01490                 // terminator of cdata.
01491                 return;
01492             }
01493         }    
01494     }
01495 }
01496 #endif
01497 
01498 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
01499 {
01500     value = "";
01501     TiXmlDocument* document = GetDocument();
01502 
01503     if ( data )
01504     {
01505         data->Stamp( p, encoding );
01506         location = data->Cursor();
01507     }
01508 
01509     const char* const startTag = "<![CDATA[";
01510     const char* const endTag   = "]]>";
01511 
01512     if ( cdata || StringEqual( p, startTag, false, encoding ) )
01513     {
01514         cdata = true;
01515 
01516         if ( !StringEqual( p, startTag, false, encoding ) )
01517         {
01518             document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
01519             return 0;
01520         }
01521         p += strlen( startTag );
01522 
01523         // Keep all the white space, ignore the encoding, etc.
01524         while (    p && *p
01525                 && !StringEqual( p, endTag, false, encoding )
01526               )
01527         {
01528             value += *p;
01529             ++p;
01530         }
01531 
01532         TIXML_STRING dummy; 
01533         p = ReadText( p, &dummy, false, endTag, false, encoding );
01534         return p;
01535     }
01536     else
01537     {
01538         bool ignoreWhite = true;
01539 
01540         const char* end = "<";
01541         p = ReadText( p, &value, ignoreWhite, end, false, encoding );
01542         if ( p )
01543             return p-1; // don't truncate the '<'
01544         return 0;
01545     }
01546 }
01547 
01548 #ifdef TIXML_USE_STL
01549 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
01550 {
01551     while ( in->good() )
01552     {
01553         int c = in->get();
01554         if ( c <= 0 )
01555         {
01556             TiXmlDocument* document = GetDocument();
01557             if ( document )
01558                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
01559             return;
01560         }
01561         (*tag) += (char) c;
01562 
01563         if ( c == '>' )
01564         {
01565             // All is well.
01566             return;
01567         }
01568     }
01569 }
01570 #endif
01571 
01572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
01573 {
01574     p = SkipWhiteSpace( p, _encoding );
01575     // Find the beginning, find the end, and look for
01576     // the stuff in-between.
01577     TiXmlDocument* document = GetDocument();
01578     if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
01579     {
01580         if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
01581         return 0;
01582     }
01583     if ( data )
01584     {
01585         data->Stamp( p, _encoding );
01586         location = data->Cursor();
01587     }
01588     p += 5;
01589 
01590     version = "";
01591     encoding = "";
01592     standalone = "";
01593 
01594     while ( p && *p )
01595     {
01596         if ( *p == '>' )
01597         {
01598             ++p;
01599             return p;
01600         }
01601 
01602         p = SkipWhiteSpace( p, _encoding );
01603         if ( StringEqual( p, "version", true, _encoding ) )
01604         {
01605             TiXmlAttribute attrib;
01606             p = attrib.Parse( p, data, _encoding );     
01607             version = attrib.Value();
01608         }
01609         else if ( StringEqual( p, "encoding", true, _encoding ) )
01610         {
01611             TiXmlAttribute attrib;
01612             p = attrib.Parse( p, data, _encoding );     
01613             encoding = attrib.Value();
01614         }
01615         else if ( StringEqual( p, "standalone", true, _encoding ) )
01616         {
01617             TiXmlAttribute attrib;
01618             p = attrib.Parse( p, data, _encoding );     
01619             standalone = attrib.Value();
01620         }
01621         else
01622         {
01623             // Read over whatever it is.
01624             while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
01625                 ++p;
01626         }
01627     }
01628     return 0;
01629 }
01630 
01631 bool TiXmlText::Blank() const
01632 {
01633     for ( unsigned i=0; i<value.length(); i++ )
01634         if ( !IsWhiteSpace( value[i] ) )
01635             return false;
01636     return true;
01637 }
01638