// HTMLSCAN.CXX

//+--------------------------------------------------------------------------- 
//
// Copyright 1992 - 1998 Microsoft Corporation.
//
// File: htmlscan.cxx
//
// Contents: Scanner for html files
//
// Classes: CHtmlScanner
//
//----------------------------------------------------------------------------

#include <pch.cxx>
#pragma hdrstop

#include <htmlguid.hxx>
#include <charhash.hxx>
#include <htmlfilt.hxx>

//+-------------------------------------------------------------------------
//
// Method: CToken::IsMatchProperty
//
// Synopsis: Does the token's property match the given property ?
//
// Arguments: [propSpec] -- Property to match
//
//--------------------------------------------------------------------------

BOOL CToken::IsMatchProperty( CFullPropSpec& propSpec )
{
    //
    // A property spec that is not identified by propid can never match
    //
    if ( !propSpec.IsPropertyPropid() )
        return FALSE;

    //
    // The property set must be the one recorded for this token ...
    //
    if ( !( propSpec.GetPropSet() == _guidPropset ) )
        return FALSE;

    //
    // ... and so must the propid within that set
    //
    if ( propSpec.GetPropertyPropid() == _propid )
        return TRUE;

    return FALSE;
}


//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::CHtmlScanner
//
// Synopsis: Constructor
//
// Arguments: [htmlIFilter] -- Reference to Html filter
// [serialStream] -- Reference to input stream to scan
//
//--------------------------------------------------------------------------

CHtmlScanner::CHtmlScanner( CHtmlIFilter& htmlIFilter,
                            CSerialStream& serialStream )
    : _htmlIFilter( htmlIFilter ),
      _serialStream( serialStream ),
      _uLenTagBuf( TAG_BUFFER_SIZE ),
      _cTagCharsRead( 0 )
{
    //
    // The tag buffer starts out at TAG_BUFFER_SIZE chars (the length just
    // recorded in _uLenTagBuf); GrowTagBuffer enlarges it on demand.
    //
    _pwcTagBuf = newk(mtNewX, NULL) WCHAR[ _uLenTagBuf ];
}


//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::~CHtmlScanner
//
// Synopsis: Destructor
//
//--------------------------------------------------------------------------

CHtmlScanner::~CHtmlScanner()
{
    //
    // Release the tag buffer allocated by the constructor (or by the most
    // recent GrowTagBuffer call)
    //
    delete [] _pwcTagBuf;
}



//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::GetBlockOfChars
//
// Synopsis: Returns a block of chars up to the size requested by user. If
// any Html tag is encountered, it stops scanning, and returns the
// token found.
//
// Arguments: [cCharsNeeded] -- Maximum # chars to scan
// [awcBuffer] -- Buffer to fill with scanned chars
// [cCharsScanned] -- # chars actually scanned
// [token] -- Token found (if any)
//
//--------------------------------------------------------------------------

void CHtmlScanner::GetBlockOfChars( ULONG cCharsNeeded,
                                    WCHAR *awcBuffer,
                                    ULONG& cCharsScanned,
                                    CToken& token )
{
    //
    // Copy one char per iteration until the request is satisfied. The two
    // early exits (end of input, start of a tag) report through token;
    // token is left untouched when the request is simply filled.
    //
    for ( cCharsScanned = 0; cCharsNeeded > 0; cCharsNeeded-- )
    {
        if ( _serialStream.Eof() )
        {
            token.SetTokenType( EofToken );
            return;
        }

        WCHAR wchIn = _serialStream.GetChar();

        if ( wchIn == L'<' )
        {
            //
            // Start of an Html tag: let ScanTag classify it and stop here
            //
            ScanTag( token );
            return;
        }

        //
        // &lt; and &gt; arrive as private use area chars so that they cannot
        // be mistaken for the '<' and '>' that delimit tags. Translate them
        // back to the real chars before handing them to the caller.
        //
        WCHAR wchOut = wchIn;
        if ( wchIn == PRIVATE_USE_MAPPING_FOR_LT )
            wchOut = L'<';
        else if ( wchIn == PRIVATE_USE_MAPPING_FOR_GT )
            wchOut = L'>';

        awcBuffer[cCharsScanned] = wchOut;
        cCharsScanned++;
    }
}



//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::SkipCharsUntilNextRelevantToken
//
// Synopsis: Skips characters in input until EOF or an interesting token
// is found. The list of properties that were asked to be filtered
// as part of the IFilter::Init call determines whether a token is
// interesting or not.
//
// Arguments: [token] -- Token found returned here (EofToken, TextToken,
// or the tag token that stopped the scan)
//
//--------------------------------------------------------------------------

void CHtmlScanner::SkipCharsUntilNextRelevantToken( CToken& token )
{
    //
    // Consume input until something worth reporting turns up
    //
    while ( !_serialStream.Eof() )
    {
        WCHAR wch = _serialStream.GetChar();

        if ( wch != L'<' )
        {
            //
            // Plain text. If the filter wants content, put the char back so
            // the caller sees the text from its first char; otherwise skip
            // ahead to the next tag.
            //
            if ( _htmlIFilter.FFilterContent() )
            {
                _serialStream.UnGetChar( wch );
                token.SetTokenType( TextToken );
                return;
            }

            EatText();
            continue;
        }

        //
        // A tag. Report it if input ran out while scanning it, or if the
        // filter considers it a stop token.
        //
        ScanTag( token );

        if ( token.GetTokenType() == EofToken )
            return;

        if ( _htmlIFilter.IsStopToken( token ) )
            return;

        //
        // Uninteresting tag: discard the remainder of it
        //
        EatTag();
    }

    //
    // Input exhausted without finding a relevant token
    //
    token.SetTokenType( EofToken );
}


//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::ScanTag
//
// Synopsis: Scans a Html tag from input
//
// Arguments: [token] -- Token info returned here
//
//--------------------------------------------------------------------------

void CHtmlScanner::ScanTag( CToken& token )
{
    //
    // White space may separate the '<' from the tag name
    //
    EatBlanks();

    if ( _serialStream.Eof() )
    {
        token.SetTokenType( EofToken );
        return;
    }

    WCHAR wch = _serialStream.GetChar();

    //
    // Assume a start tag; a leading '/' turns it into an end tag, in which
    // case the name starts after the '/' and any blanks that follow it.
    //
    token.SetStartTokenFlag( TRUE );
    if ( wch == L'/' )
    {
        token.SetStartTokenFlag( FALSE );
        EatBlanks();

        if ( _serialStream.Eof() )
        {
            token.SetTokenType( EofToken );
            return;
        }

        wch = _serialStream.GetChar();
    }

    //
    // Collect the tag name into awcTagName. At most MAX_TAG_LENGTH chars
    // are kept, because anything longer is most probably a bogus tag. The
    // name ends at white space, at '>', or when input runs out.
    //
    WCHAR awcTagName[MAX_TAG_LENGTH+1];
    unsigned cwcName = 0;

    for (;;)
    {
        if ( iswspace(wch) || wch == L'>' )
            break;

        if ( cwcName >= MAX_TAG_LENGTH )
            break;

        awcTagName[cwcName++] = wch;

        if ( _serialStream.Eof() )
            break;

        wch = _serialStream.GetChar();
    }
    awcTagName[cwcName] = 0;

    if ( _serialStream.Eof() )
    {
        token.SetTokenType( EofToken );
        return;
    }

    //
    // The char that ended the name was already consumed. When it is '>', or
    // when the name was cut off at MAX_TAG_LENGTH, return it to the stream
    // because a subsequent GetChar() expects to see it.
    //
    if ( wch == L'>' || cwcName == MAX_TAG_LENGTH )
        _serialStream.UnGetChar( wch );

    TagNameToToken( awcTagName, token );
}




//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::ReadTagIntoBuffer
//
// Synopsis: Reads the rest of Html tag into the internal buffer
//
//--------------------------------------------------------------------------

void CHtmlScanner::ReadTagIntoBuffer()
{
    //
    // Any previously buffered tag is discarded
    //
    _cTagCharsRead = 0;

    //
    // Copy chars up to, but not including, the closing '>'. The '>' itself
    // is consumed. Running out of input also ends the copy.
    //
    while ( !_serialStream.Eof() )
    {
        WCHAR wch = _serialStream.GetChar();
        if ( wch == L'>' )
            return;

        //
        // Make room before storing; the buffer doubles whenever it is full
        //
        if ( _cTagCharsRead >= _uLenTagBuf )
            GrowTagBuffer();
        Win4Assert( _cTagCharsRead < _uLenTagBuf );

        _pwcTagBuf[_cTagCharsRead] = wch;
        _cTagCharsRead++;
    }
}




//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::ScanTagBuffer
//
// Synopsis: Scans the internal tag buffer for a given name, and returns the
// corresponding value
//
// Arguments: [awcName] -- Pattern to match
// [pwcValue] -- Start position of value returned here
// [uLenValue] -- Length of value field
//
//--------------------------------------------------------------------------

void CHtmlScanner::ScanTagBuffer( WCHAR *awcName,
                                  WCHAR * & pwcValue,
                                  unsigned& uLenValue )
{
    //
    // Looks for the first case-insensitive occurrence of awcName in the
    // first _cTagCharsRead chars of the tag buffer. On a match, pwcValue
    // points at the char right after the matched name and uLenValue counts
    // the chars from there up to (not including) the next '"', or up to the
    // end of the buffered tag if there is no '"'. When nothing matches,
    // pwcValue is 0 and uLenValue is 0.
    //
    // NOTE(review): since the value is terminated by '"', callers presumably
    // pass a pattern that ends with the opening quote (e.g. name="); confirm
    // against the call sites.
    //
    unsigned uLenName = static_cast<unsigned>( wcslen( awcName ) );

    if ( _cTagCharsRead <= uLenName )
    {
        //
        // Buffered tag is too short to hold the pattern plus at least one
        // more char
        //
        pwcValue = 0;
        uLenValue = 0;

        return;
    }

    //
    // The subtraction cannot wrap, because _cTagCharsRead > uLenName here.
    // The bound also guarantees i + uLenName < _cTagCharsRead on a match.
    //
    for ( unsigned i=0; i<_cTagCharsRead-uLenName; i++ )
    {
        BOOL fMatch = TRUE;
        for ( unsigned j=0; j<uLenName; j++ )
        {
            //
            // Case insensitive match
            //
            if ( towlower(awcName[j]) != towlower(_pwcTagBuf[i+j]) )
            {
                fMatch = FALSE;
                break;
            }
        }

        if ( fMatch )
        {
            //
            // Find the end of the value. The bounds check must come first:
            // testing _pwcTagBuf[k] before k < _cTagCharsRead would read one
            // element past the buffered tag when there is no closing quote,
            // and past the allocation itself when the buffer is full.
            //
            unsigned k = i + uLenName;
            while ( k < _cTagCharsRead && _pwcTagBuf[k] != L'"' )
                k++;

            uLenValue = k - (i + uLenName);
            pwcValue = &_pwcTagBuf[i+uLenName];

            return;
        }
    }

    uLenValue = 0;
    pwcValue = 0;
}



//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::EatTag
//
// Synopsis: Skips characters in input until the '>' char, which demarcates
// the end of the tag
//
//--------------------------------------------------------------------------

void CHtmlScanner::EatTag()
{
    if ( _serialStream.Eof() )
        return;

    //
    // Discard chars up to and including the closing '>', or until input
    // runs out
    //
    WCHAR wch;
    do
    {
        wch = _serialStream.GetChar();
    }
    while ( wch != L'>' && !_serialStream.Eof() );
}



//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::EatText
//
// Synopsis: Skips characters in input until a '<', ie a tag is encountered
//
//--------------------------------------------------------------------------

void CHtmlScanner::EatText()
{
    if ( _serialStream.Eof() )
        return;

    //
    // Discard chars until a '<' has been read or input runs out
    //
    WCHAR wch;
    do
    {
        wch = _serialStream.GetChar();
    }
    while ( wch != L'<' && !_serialStream.Eof() );

    //
    // The '<' belongs to the tag that follows, so hand it back
    //
    if ( wch == L'<' )
        _serialStream.UnGetChar( wch );
}



//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::EatBlanks
//
// Synopsis: Skips generic white space characters in input
//
//--------------------------------------------------------------------------

void CHtmlScanner::EatBlanks()
{
    if ( _serialStream.Eof() )
        return;

    //
    // Discard chars until a non-blank has been read or input runs out
    //
    WCHAR wch;
    do
    {
        wch = _serialStream.GetChar();
    }
    while ( iswspace(wch) && !_serialStream.Eof() );

    //
    // The first non-blank char was read one too far, so hand it back
    //
    if ( !iswspace(wch) )
        _serialStream.UnGetChar( wch );
}




//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::TagNameToToken
//
// Synopsis: Maps a tag name to token information
//
// Arguments: [awcTagName] -- Tag name to map
// [token] -- Token information returned here
//
//--------------------------------------------------------------------------

void CHtmlScanner::TagNameToToken( WCHAR *awcTagName, CToken& token )
{
    //
    // Only a few Html tags are interesting, so a table lookup is not needed.
    // Dispatch on the first char of the name, then compare the whole name
    // without regard to case. Every recognised tag sets its token info and
    // returns; whatever leaves the switch is classified as a generic tag at
    // the bottom of the function.
    //
    switch( awcTagName[0] )
    {
    case L'A':
    case L'a':
        if ( _wcsicmp( awcTagName, L"a" ) == 0 )
        {
            token.SetTokenType( AnchorToken );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HREF );
            return;
        }
        if ( _wcsicmp( awcTagName, L"address" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'B':
    case L'b':
        if ( _wcsicmp( awcTagName, L"br" ) == 0
             || _wcsicmp( awcTagName, L"blockquote" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'D':
    case L'd':
        if ( _wcsicmp( awcTagName, L"dd" ) == 0
             || _wcsicmp( awcTagName, L"dl" ) == 0
             || _wcsicmp( awcTagName, L"dt" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'F':
    case L'f':
        if ( _wcsicmp( awcTagName, L"form" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'H':
    case L'h':
        //
        // Headings h1 .. h6 each map to their own token type and propid
        //
        if ( _wcsicmp( awcTagName, L"h1" ) == 0 )
        {
            token.SetTokenType( Heading1Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_1 );
            return;
        }
        if ( _wcsicmp( awcTagName, L"h2" ) == 0 )
        {
            token.SetTokenType( Heading2Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_2 );
            return;
        }
        if ( _wcsicmp( awcTagName, L"h3" ) == 0 )
        {
            token.SetTokenType( Heading3Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_3 );
            return;
        }
        if ( _wcsicmp( awcTagName, L"h4" ) == 0 )
        {
            token.SetTokenType( Heading4Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_4 );
            return;
        }
        if ( _wcsicmp( awcTagName, L"h5" ) == 0 )
        {
            token.SetTokenType( Heading5Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_5 );
            return;
        }
        if ( _wcsicmp( awcTagName, L"h6" ) == 0 )
        {
            token.SetTokenType( Heading6Token );
            token.SetPropset( CLSID_HtmlInformation );
            token.SetPropid( PID_HEADING_6 );
            return;
        }
        break;

    case L'I':
    case L'i':
        if ( _wcsicmp( awcTagName, L"input" ) == 0 )
        {
            token.SetTokenType( InputToken );
            return;
        }
        if ( _wcsicmp( awcTagName, L"img" ) == 0 )
        {
            token.SetTokenType( ImageToken );
            return;
        }
        break;

    case L'L':
    case L'l':
        if ( _wcsicmp( awcTagName, L"li" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'M':
    case L'm':
        if ( _wcsicmp( awcTagName, L"math" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        if ( _wcsicmp( awcTagName, L"meta" ) == 0 )
        {
            token.SetTokenType( MetaToken );
            return;
        }
        break;

    case L'O':
    case L'o':
        if ( _wcsicmp( awcTagName, L"ol" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'P':
    case L'p':
        if ( _wcsicmp( awcTagName, L"p" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'S':
    case L's':
        if ( _wcsicmp( awcTagName, L"script" ) == 0 )
        {
            token.SetTokenType( ScriptToken );
            return;
        }
        break;

    case L'T':
    case L't':
        if ( _wcsicmp( awcTagName, L"title" ) == 0 )
        {
            token.SetTokenType( TitleToken );
            token.SetPropset( CLSID_SummaryInformation );
            token.SetPropid( PID_TITLE );
            return;
        }
        if ( _wcsicmp( awcTagName, L"table" ) == 0
             || _wcsicmp( awcTagName, L"th" ) == 0
             || _wcsicmp( awcTagName, L"tr" ) == 0
             || _wcsicmp( awcTagName, L"td" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'U':
    case L'u':
        if ( _wcsicmp( awcTagName, L"ul" ) == 0 )
        {
            token.SetTokenType( BreakToken );
            return;
        }
        break;

    case L'!':
        if ( _wcsicmp( awcTagName, L"!--" ) == 0 )
        {
            token.SetTokenType( CommentToken );
            return;
        }
        break;

    default:
        break;
    }

    //
    // It's an uninteresting tag
    //
    token.SetTokenType( GenericToken );
}




//+-------------------------------------------------------------------------
//
// Method: CHtmlScanner::GrowTagBuffer
//
// Synopsis: Grow internal tag buffer to twice its current size
//
//--------------------------------------------------------------------------

void CHtmlScanner::GrowTagBuffer()
{
WCHAR *pwcNewTagBuf = newk(mtNewX, NULL) WCHAR[2 * _uLenTagBuf];
RtlCopyMemory( pwcNewTagBuf,
_pwcTagBuf,
_uLenTagBuf * sizeof(WCHAR) );

delete[] _pwcTagBuf;
_uLenTagBuf *= 2;
_pwcTagBuf = pwcNewTagBuf;
}