// TDCPARSE.CPP

//------------------------------------------------------------------------ 
//
// Tabular Data Control Parsing Module
// Copyright (C) Microsoft Corporation, 1996, 1997
//
// File: TDCParse.cpp
//
// Contents: Implementation of CTDCParse classes.
//
//------------------------------------------------------------------------


#include "stdafx.h"
#include "STD.h"
#include "TDC.h"
#include <MLang.h>
#include "Notify.h"
#include "TDCParse.h"
#include "TDCArr.h"
#include "locale.h"
#include "wch.h"

#define BYTE_ORDER_MARK 0xFEFF
#define REVERSE_BYTE_ORDER_MARK 0xFFFE

//------------------------------------------------------------------------
//
//  Function:  IsSpace()
//
//  Synopsis:  Classifies a character as horizontal white-space.
//
//  Arguments: ch   Character to classify.
//
//  Returns:   TRUE  if 'ch' is a blank (L' ') or a tab (L'\t').
//             FALSE for every other character.
//
//------------------------------------------------------------------------

inline boolean IsSpace(WCHAR ch)
{
    switch (ch)
    {
    case L' ':
    case L'\t':
        return TRUE;

    default:
        return FALSE;
    }
}

//////////////////////////////////////////////////////////////////////////
//
// Tokeniser methods (implemented on CTDCUnify) - see comments in file
// TDCParse.h
// -----------------------------------------
//////////////////////////////////////////////////////////////////////////


//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::InitTokenizer()
//
//  Synopsis:  Prepares the tokeniser: remembers the field sink and the
//             control characters, and clears every parse-state flag.
//
//  Arguments: pFieldSink     Object to send parsed fields to.
//             wchDelimField  Field delimiter.
//             wchDelimRow    Row delimiter.  0 selects DEFAULT_ROW_DELIM[0];
//                            a value equal to the field delimiter is
//                            discarded (reset to 0).
//             wchQuote       Quote character.  Discarded (reset to 0) if
//                            it clashes with a delimiter.
//             wchEscape      Escape character.  Discarded (reset to 0) if
//                            it clashes with a delimiter or the quote.
//
//  Returns:   S_OK indicating success.
//
//------------------------------------------------------------------------

HRESULT CTDCUnify::InitTokenizer(CTDCFieldSink *pFieldSink, WCHAR wchDelimField,
                                 WCHAR wchDelimRow, WCHAR wchQuote, WCHAR wchEscape)
{
    _ASSERT(pFieldSink != NULL);

    // Destination for parsed fields, and a fresh parse position.
    //
    m_pFieldSink = pFieldSink;
    m_ucParsed = 0;

    // No state is carried over from any previous parse.
    //
    m_fFoldWhiteSpace = FALSE;
    m_fQuoteActive = FALSE;
    m_fEscapeActive = FALSE;
    m_fIgnoreNextWhiteSpace = FALSE;
    m_fIgnoreNextCR = FALSE;
    m_fIgnoreNextLF = FALSE;

    m_wchDelimField = wchDelimField;

    // Row delimiter: fall back to the default when none was supplied,
    // then drop it again if it collides with the field delimiter.
    //
    m_wchDelimRow = wchDelimRow;
    if (m_wchDelimRow == 0)
        m_wchDelimRow = DEFAULT_ROW_DELIM[0];
    if (m_wchDelimRow == m_wchDelimField)
        m_wchDelimRow = 0;

    // Quote character: only usable if it is distinct from both delimiters.
    //
    m_wchQuote = wchQuote;
    if (m_wchQuote != 0 &&
        (m_wchQuote == m_wchDelimField || m_wchQuote == m_wchDelimRow))
    {
        m_wchQuote = 0;
    }

    // Escape character: only usable if it is distinct from the
    // delimiters and from the quote character.
    //
    m_wchEscape = wchEscape;
    if (m_wchEscape != 0 &&
        (m_wchEscape == m_wchDelimField ||
         m_wchEscape == m_wchDelimRow ||
         m_wchEscape == m_wchQuote))
    {
        m_wchEscape = 0;
    }

    // When rows are separated by CR or LF, CR-LF / LF-CR pairs are folded
    // into a single row break by AddWcharBuffer().
    //
    switch (m_wchDelimRow)
    {
    case L'\r':
    case L'\n':
        m_fFoldCRLF = TRUE;
        break;

    default:
        m_fFoldCRLF = FALSE;
        break;
    }

    return S_OK;
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::AddWcharBuffer()
//
//  Synopsis:  Tokenises the not-yet-parsed tail of the wide-character
//             buffer (m_psWcharBuf, m_ucWcharBufCount characters, of
//             which the first m_ucParsed have already been scanned).
//             Each completed field is passed to m_pFieldSink->AddField()
//             and each completed row to m_pFieldSink->EOLN().
//
//             Field text is compacted in place: escape and quote
//             characters are dropped and the text of the field currently
//             being built is kept at the start of the buffer, so that an
//             unfinished field survives until the next call.
//
//  Arguments: fLastData   TRUE if no more data will follow.  A non-empty
//                         unfinished field is then emitted as the final
//                         field of a final row.
//
//  Returns:   S_OK upon success.
//             Otherwise the failure code returned by the field sink's
//             AddField() or EOLN().
//
//------------------------------------------------------------------------

HRESULT CTDCUnify::AddWcharBuffer(BOOL fLastData)
{

    OutputDebugStringX(_T("CTDCTokenise::AddWcharBuffer called\n"));

    _ASSERT(m_pFieldSink != NULL);

    HRESULT hr = S_OK;

    LPWCH   pwchCurr;   // Next character to process
    LPWCH   pwchEnd;    // End-of-buffer marker
    LPWCH   pwchDest;   // Where to write next char processed
    LPWCH   pwchStart;  // Beginning of current token

    pwchStart = &m_psWcharBuf[0];
    pwchCurr = pwchStart + m_ucParsed;
    pwchDest = pwchCurr;
    pwchEnd = &m_psWcharBuf[m_ucWcharBufCount];

    // Read up to the next field boundary (field or row delimiter)
    //
    while (pwchCurr < pwchEnd)
    {
        if (m_fIgnoreNextLF)
        {
            // We're expecting a LF to terminate a CR-LF sequence.
            //
            m_fIgnoreNextLF = FALSE;
            if (*pwchCurr == L'\n')
            {
                // Found a LF - ignore it
                //
                pwchCurr++;
                continue;
            }

            // Found something else - carry on ...
            //
        }

        if (m_fIgnoreNextCR)
        {
            // We're expecting a CR to terminate a LF-CR sequence.
            //
            m_fIgnoreNextCR = FALSE;
            if (*pwchCurr == L'\r')
            {
                // Found a CR - ignore it
                //
                pwchCurr++;
                continue;
            }

            // Found something else - carry on ...
            //
        }

        if (m_fIgnoreNextWhiteSpace)
        {
            // We're expecting the rest of a white-space sequence
            //
            if (IsSpace(*pwchCurr))
            {
                // Found white-space - ignore it
                //
                pwchCurr++;
                continue;
            }
            m_fIgnoreNextWhiteSpace = FALSE;
        }

        // Escape characters work, even in quoted strings
        //
        if (m_fEscapeActive)
        {
            *pwchDest++ = *pwchCurr++;
            m_fEscapeActive = FALSE;
            continue;
        }
        if (*pwchCurr == m_wchEscape)
        {
            pwchCurr++;
            m_fEscapeActive = TRUE;
            continue;
        }

        // Quotes activate/deactivate Field/Row delimiters
        //
        if (*pwchCurr == m_wchQuote)
        {
            pwchCurr++;
            m_fQuoteActive = !m_fQuoteActive;
            continue;
        }

        if (m_fQuoteActive)
        {
            *pwchDest++ = *pwchCurr++;
            continue;
        }


        if (*pwchCurr == m_wchDelimField ||
            (m_fFoldWhiteSpace && IsSpace(*pwchCurr)))
        {
            hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
            if (!SUCCEEDED(hr))
                goto Cleanup;
            pwchCurr++;

            // When folding white-space, swallow any white-space that
            // follows the delimiter.  The flag is raised unconditionally
            // rather than by peeking at *pwchCurr: the delimiter may have
            // been the last character in the buffer, in which case a peek
            // would read past the valid data.  The check at the top of
            // the loop clears the flag again as soon as a
            // non-white-space character is seen, and it carries over
            // correctly into the next buffer.
            //
            if (m_fFoldWhiteSpace)
                m_fIgnoreNextWhiteSpace = TRUE;

            pwchStart = &m_psWcharBuf[0];
            pwchDest = pwchStart;
            continue;
        }

        if (*pwchCurr == m_wchDelimRow ||
            (m_fFoldCRLF && (*pwchCurr == L'\r' || *pwchCurr == L'\n')))
        {
            hr = m_pFieldSink->AddField(pwchStart, pwchDest - pwchStart);
            if (!SUCCEEDED(hr))
                goto Cleanup;
            hr = m_pFieldSink->EOLN();
            if (!SUCCEEDED(hr))
                goto Cleanup;
            if (m_fFoldCRLF)
            {
                // Remember which half of a CR-LF / LF-CR pair to skip.
                //
                m_fIgnoreNextLF = (*pwchCurr == L'\r');
                m_fIgnoreNextCR = (*pwchCurr == L'\n');
            }
            pwchCurr++;
            pwchStart = &m_psWcharBuf[0];
            pwchDest = pwchStart;
            continue;
        }

        *pwchDest++ = *pwchCurr++;
    }

    // Whatever is left at the start of the buffer is the unfinished
    // field; it has already been scanned, so it is both the whole buffer
    // content and the already-parsed prefix.
    //
    m_ucWcharBufCount = pwchDest - pwchStart;
    m_ucParsed = pwchDest - pwchStart;  // amount we've already parsed

    // If this is the last data packet, and there's a fragment left,
    // parse it.
    if (m_ucWcharBufCount && fLastData)
    {
        hr = m_pFieldSink->AddField(pwchStart, m_ucParsed);
        if (!SUCCEEDED(hr))
            goto Cleanup;
        m_ucParsed = 0;
        hr = m_pFieldSink->EOLN();
        return hr;
    }


Cleanup:
    return hr;
}




//////////////////////////////////////////////////////////////////////////
//
// CTDCUnify Class - see comments in file TDCParse.h
// ---------------
//////////////////////////////////////////////////////////////////////////

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::CTDCUnify()
//
//  Synopsis:  Constructor.  Puts the members that the destructor touches
//             into a known state, so that destroying an object on which
//             Create() was never called does not delete[] an
//             uninitialised pointer.
//
//------------------------------------------------------------------------

CTDCUnify::CTDCUnify()
{
    m_pML = NULL;

    // The destructor unconditionally delete[]s these buffers.
    m_psByteBuf = NULL;
    m_ucByteBufSize = 0;
    m_ucByteBufCount = 0;

    m_psWcharBuf = NULL;
    m_ucWcharBufSize = 0;
    m_ucWcharBufCount = 0;
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::~CTDCUnify()
//
//  Synopsis:  Destructor.  Frees the byte and wide-character buffers and
//             drops the reference held on the MLang object, if any.
//
//------------------------------------------------------------------------

CTDCUnify::~CTDCUnify()
{
    // Release both conversion buffers (delete [] on NULL is harmless).
    delete [] m_psByteBuf;
    delete [] m_psWcharBuf;

    // Nothing more to do when no MLang object was ever attached.
    if (m_pML == NULL)
        return;

    m_pML->Release();
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::Create()
//
//  Synopsis:  Initialise the CTDCUnify object: record the code page and
//             the MLang object, reset all conversion state and buffers,
//             and ask MLang whether the code page can be converted to
//             Unicode.
//
//  Arguments: nCodePage   Code page for ASCII->Unicode conversions
//             pML         MLANG COM object (used for conversions).
//                         May be NULL, in which case the object is
//                         marked as unable to convert to Unicode.
//
//  Returns:   S_OK to indicate success.
//
//------------------------------------------------------------------------

HRESULT CTDCUnify::Create(UINT nCodePage, IMultiLanguage *pML)
{
    // Take our own reference on the MLang object.  The NULL test must
    // come first: the code below already treats a NULL pML as legal.
    m_pML = pML;
    if (m_pML != NULL)
        m_pML->AddRef();

    m_nCodePage = nCodePage;
    m_fDataMarkedUnicode = FALSE;
    m_fDataIsUnicode = FALSE;
    m_dwBytesProcessed = 0;
    m_fCanConvertToUnicode = 0;
    m_nUnicode = 0;

    m_dwConvertMode = 0;
    m_ucByteBufSize = 0;
    m_ucByteBufCount = 0;
    m_psByteBuf = NULL;

    m_ucWcharBufSize = 0;
    m_ucWcharBufCount = 0;
    m_psWcharBuf = NULL;

    if (m_pML != NULL)
    {
        HRESULT hr;

        // NOTE(review): any success code (including S_FALSE) is treated
        // as "convertible" here - confirm against the IsConvertible
        // contract.
        hr = m_pML->IsConvertible(m_nCodePage, UNICODE_CP);
        m_fCanConvertToUnicode = SUCCEEDED(hr);
    }
    return S_OK;
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::IsUnicode
//
//  Synopsis:  Determines if our text buffer is Unicode or not.  Should
//             only be called once on the FIRST text buffer.
//
//             Assume if the data is marked as Unicode, that it's correct.
//
//             The determination this routine makes will override any
//             single byte codepage the user may have specified.
//
//  Arguments: pBytes      Buffer containing the bytes to be examined.
//             dwSize      Number of significant bytes in 'pBytes'.
//                         Fewer than sizeof(WCHAR) bytes cannot hold a
//                         signature and yield 0.
//
//  Returns:   Code page of text, or zero if not Unicode (UNICODE_CP,
//             UNICODE_REVERSE_CP, or 0)
//
//------------------------------------------------------------------------
int
CTDCUnify::IsUnicode(BYTE * pBytes, DWORD dwSize)
{
    // Document.Write can cause our hosting page to be Unicode, when the TDC
    // data page might not be.  This can cause some real problems..
    if (UNICODE_CP == m_nCodePage ||
        UNICODE_REVERSE_CP == m_nCodePage)
    {
        return m_nCodePage;
    }

    // Not enough data to contain a signature.
    if (pBytes == NULL || dwSize < sizeof(WCHAR))
        return 0;

    WCHAR wchSignature = *(WCHAR *)pBytes;

    if (BYTE_ORDER_MARK == wchSignature)
        return UNICODE_CP;

    // A byte-swapped signature means the data has the opposite byte
    // order.  Compare against the mark itself, not the code-page id.
    if (REVERSE_BYTE_ORDER_MARK == wchSignature)
        return UNICODE_REVERSE_CP;

    return 0;
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::ConvertByteBuffer()
//
//  Synopsis:  Appends a packet of raw bytes to the byte buffer and
//             converts as much of that buffer as possible into wide
//             characters, appending them to the wide-character buffer
//             (m_psWcharBuf / m_ucWcharBufCount).  Unicode data
//             (either byte order) is copied and byte-swapped here;
//             anything else is converted through MLang.  Bytes that
//             could not be converted yet stay in the byte buffer for the
//             next call.
//
//  Arguments: pBytes      Buffer containing the bytes to be converted.
//             dwSize      Number of significant bytes in 'pBytes'
//                         (asserted to be non-zero).
//
//  Returns:   S_OK upon success.
//             OLE_E_CANTCONVERT if a non-unicode buffer can't be
//                         converted into unicode.
//             E_OUTOFMEMORY if there isn't enough memory to perform
//                         a data conversion.
//             Otherwise the failure code from the MLang conversion.
//
//------------------------------------------------------------------------

HRESULT CTDCUnify::ConvertByteBuffer(BYTE *pBytes, DWORD dwSize)
{
    OutputDebugStringX(_T("CTDCUnify::AddByteBuffer called\n"));

    _ASSERT(pBytes != NULL || dwSize == 0);

    // Examine the data to determine if it's in UNICODE format or not.
    // Convert to UNICODE if neccessary.
    //
    HRESULT hr = S_OK;
    UINT    ucBytes;
    UINT    ucWchars;

    _ASSERT(pBytes != NULL);
    _ASSERT(dwSize > 0);

    // NOTE(review): this only records the error; processing continues
    // and the value is returned unless a later step overwrites it.
    // Confirm that callers rely on this.
    if (!m_fCanConvertToUnicode)
        hr = OLE_E_CANTCONVERT;

    // Is there enough space in Byte buffer for this packet?
    if (dwSize > (m_ucByteBufSize - m_ucByteBufCount))
    {
        // No, the current buffer is too small, make a new one.
        BYTE * psTemp = new BYTE[m_ucByteBufCount + dwSize];
        if (psTemp==NULL)
        {
            hr = E_OUTOFMEMORY;

            goto Done;
        }

        if (m_psByteBuf != NULL)        // if not first time
        {
            memmove(psTemp, m_psByteBuf, m_ucByteBufCount);
            delete [] m_psByteBuf;
        }
        m_ucByteBufSize = m_ucByteBufCount + dwSize;
        m_psByteBuf = psTemp;
    }

    // Append the new data to the old data.
    memmove(m_psByteBuf + m_ucByteBufCount, pBytes, dwSize);
    m_ucByteBufCount += dwSize;

    // Is there enough space in the Wchar buffer for the converted data?
    // We make a very conservative assumption here that N source buffer bytes
    // convert to N Wchar buffer chars (or 2*N bytes).  This will ensure that
    // our call to ConvertToUnicode will never not finish because there wasn't
    // enough room in the output buffer.
    if (dwSize > (m_ucWcharBufSize - m_ucWcharBufCount))
    {
        // The current buffer is too small, make a new one.
        WCHAR * psTemp = new WCHAR[m_ucWcharBufCount + dwSize];
        if (psTemp==NULL)
        {
            hr = E_OUTOFMEMORY;
            goto Done;
        }

        if (m_psWcharBuf != NULL)       // if not first time
        {
            memmove(psTemp, m_psWcharBuf,
                    m_ucWcharBufCount*sizeof(WCHAR));
            delete [] m_psWcharBuf;
        }
        m_psWcharBuf = psTemp;
        m_ucWcharBufSize = m_ucWcharBufCount + dwSize;
    }

    if (0 == m_dwBytesProcessed)
    {
        // Need at least 2 bytes for Unicode signature (0xFFFE or 0xFEFF)
        if (m_ucByteBufCount > 1)
        {
            // If we detect Unicode, it overrides any user specified code page.
            m_nUnicode = IsUnicode(m_psByteBuf, m_ucByteBufCount);
            m_nCodePage = m_nUnicode ? m_nUnicode : m_nCodePage;
        }
        else
            goto Done;                  // too little to detect anything yet
    }

    // Convert as many source bytes as we can to Unicode chars
    ucBytes = m_ucByteBufCount;
    ucWchars = m_ucWcharBufSize - m_ucWcharBufCount;

    // ConvertStringToUnicode won't convert Unicode to Unicode for us.
    // So we'll do it ourselves.
    if (m_nUnicode)
    {
        // Where the newly converted characters are appended.
        WCHAR *pwchNew = m_psWcharBuf + m_ucWcharBufCount;

        _ASSERT( ucWchars * sizeof(WCHAR) >= ucBytes);

        // This might copy an odd extra byte
        memmove((BYTE *)pwchNew, m_psByteBuf, ucBytes);

        // But we only count the number of complete WCHAR's we copied.
        ucWchars = ucBytes / sizeof(WCHAR);
        ucBytes = ucWchars * sizeof(WCHAR);

        if (UNICODE_REVERSE_CP == m_nUnicode)
        {
            // need to byte swap
            BYTE *pByteSwap = (BYTE *)pwchNew;
            BYTE bTemp;
            for (ULONG i = ucWchars; i != 0; i--)
            {
                // Well, OK, we've kind of hardwired WCHAR == 2 here, but ..
                bTemp = pByteSwap[0];
                pByteSwap[0] = pByteSwap[1];
                pByteSwap[1] = bTemp;
                pByteSwap += 2;
            }
        }

        // On first packet, need to remove Unicode signature.
        // Only need to look for 0xFEFF -- we already swapped bytes.
        if (0 == m_dwBytesProcessed && ucWchars > 0 &&
            pwchNew[0] == BYTE_ORDER_MARK)
        {
            ucWchars--;
            // Slide the remaining characters down over the signature.
            // The length is in WCHARs, so scale by sizeof(WCHAR).
            memmove(pwchNew, pwchNew + 1, ucWchars*sizeof(WCHAR));
        }
    }
    else
    {
        // Without an MLang object there is nothing to convert with.
        if (m_pML == NULL)
        {
            hr = OLE_E_CANTCONVERT;
            goto Done;
        }

        hr = m_pML->ConvertStringToUnicode(&m_dwConvertMode, m_nCodePage,
                                           (char *)m_psByteBuf, &ucBytes,
                                           m_psWcharBuf +m_ucWcharBufCount,
                                           &ucWchars);

        // On failure the in/out counts are not trustworthy; leave both
        // buffers untouched rather than account for unconverted data.
        if (FAILED(hr))
            goto Done;
    }

    // Move any leftover source characters to the start of the buffer.
    // These are probably split Unicode chars, lead bytes without trail
    // bytes, etc.
    m_ucByteBufCount -= ucBytes;
    memmove(m_psByteBuf, m_psByteBuf + ucBytes,
            m_ucByteBufCount);

    // The number of useful chars in the output buf is increased by the
    // number we managed to convert.
    m_ucWcharBufCount += ucWchars;
    m_dwBytesProcessed += ucWchars;

Done:
    return hr;
}

// Advances past any run of blanks/tabs starting at 'pwchCurr' and returns
// a pointer to the first character that is not white-space.  The scan
// relies on the text being terminated by some non-space character.
LPWCH SkipSpace(LPWCH pwchCurr)
{
    LPWCH pwchScan = pwchCurr;

    for (; IsSpace(*pwchScan); ++pwchScan)
    {
        // nothing - just step over the white-space
    }

    return pwchScan;
}

// Returns TRUE if 'ch' ends a line: NUL, carriage return or line feed.
static
boolean IsEnd(WCHAR ch)
{
    switch (ch)
    {
    case 0:
    case L'\r':
    case L'\n':
        return TRUE;

    default:
        return FALSE;
    }
}

// Returns TRUE if 'ch' terminates a single name in a list: either the
// ';' separator or any end-of-line character (see IsEnd).
static
boolean IsBreak(WCHAR ch)
{
    if (ch == L';')
        return TRUE;

    return IsEnd(ch);
}

// Compares one entry of a ';'-separated name list against a host name.
//
// pwchMatchName  Start of the entry; it runs up to the next ';', CR, LF
//                or NUL.  Trailing blanks/tabs are ignored.  The entry
//                may be "*" (matches anything) or start with "*."
//                (matches any host that ends with the rest of the entry).
// pwzHostName    NUL-terminated host name to test.
// ppwchAdvance   Receives a pointer to the entry's terminator, or, for an
//                empty entry ended by ';', to the character after the ';'.
//
// Returns FALSE if names didn't match (including an empty entry).
// Returns TRUE if they did.
// The comparison is an exact, character-by-character one.
BOOL
MatchName(LPWCH pwchMatchName, LPCWCH pwzHostName, LPWCH *ppwchAdvance)
{
    // match from right to left
    LPWCH pwchMatchRight = &pwchMatchName[0];
    LPCWCH pwchHostRight = &pwzHostName[0] + ocslen(pwzHostName) -1;

    // handle empty match name
    if (IsBreak(*pwchMatchRight))
    {
        if (!IsEnd(*pwchMatchRight))    // be sure to advance (unless at end)
            ++ pwchMatchRight;
        *ppwchAdvance = pwchMatchRight;
        return FALSE;
    }

    // Find end of Match name.
    while (!IsBreak(*pwchMatchRight)) pwchMatchRight++;

    *ppwchAdvance = pwchMatchRight;     // return pointer to terminator

    pwchMatchRight--;

    // Ignore trailing whitespace.  The bounds test comes first so that
    // nothing in front of the name is ever read.
    while (pwchMatchRight >= pwchMatchName && IsSpace(*pwchMatchRight))
        -- pwchMatchRight;

    // A name consisting only of white-space is as good as empty.
    if (pwchMatchRight < pwchMatchName)
        return FALSE;

    // match full wildcard the easy way
    if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*')
        return TRUE;

    // match right-to-left, stop at mismatch or beginning of either string
    for (; pwchMatchRight>=pwchMatchName && pwchHostRight>=pwzHostName;
         --pwchMatchRight, --pwchHostRight)
    {
        if (*pwchMatchRight != *pwchHostRight || *pwchMatchRight == '*')
            break;
    }

    // it's a match if strings matched completely
    if (pwchMatchRight+1 == pwchMatchName && pwchHostRight+1 == pwzHostName)
        return TRUE;

    // or if match name started with "*." and the rest matched a suffix of host name
    if (pwchMatchRight == pwchMatchName && pwchMatchRight[0] == '*' &&
        pwchMatchRight[1] == '.')
        return TRUE;

    // otherwise it's not a match
    return FALSE;
}

// Parses the remainder of an allow-domains line at the start of the
// wide-character buffer.  The line is expected to look like
// "= name ; name ; ..." and to be ended by CR, LF or NUL; each name is
// tested against 'pwzURL' with MatchName().
//
// Whatever the outcome, the line (including its CR, LF or CR-LF
// terminator) is removed from the buffer so that it is not parsed as data.
//
// NOTE(review): the name scan itself relies on a line terminator being
// present inside the buffer - presumably guaranteed by a prior
// CheckForAllowDomainList() call; confirm against the caller.
//
// Returns S_OK   if one of the listed names matches 'pwzURL'.
//         E_FAIL otherwise (no '=', empty list, no match, empty buffer).
HRESULT
CTDCUnify::MatchAllowDomainList(LPCWSTR pwzURL)
{
    HRESULT hr = E_FAIL;                // assume failure
    LPWCH pwchCurr;
    LPWCH pwchCurr2;
    LPWCH pwchEnd;                      // one past the last valid character
    UINT cchConsumed;                   // characters to drop from the buffer

    // Nothing to parse (and nothing to remove) without any buffered text.
    if (m_psWcharBuf == NULL || m_ucWcharBufCount == 0)
        return hr;

    pwchCurr = &m_psWcharBuf[0];
    pwchEnd = m_psWcharBuf + m_ucWcharBufCount;

    // skip over white space
    pwchCurr = SkipSpace(pwchCurr);
    if (IsEnd(*pwchCurr))
        goto Cleanup;

    // must have the equal sign
    if (*pwchCurr++ != '=' || *pwchCurr == '\0')
        goto Cleanup;

    while (TRUE)
    {
        // skip over white space
        pwchCurr = SkipSpace(pwchCurr);

        if (IsEnd(*pwchCurr))           // terminate on \r, \n, \0
            break;

        if (IsBreak(*pwchCurr))         // Must be ';',
            pwchCurr++;                 // skip it.

        // skip over white space
        pwchCurr = SkipSpace(pwchCurr);

        if (MatchName(pwchCurr, pwzURL, &pwchCurr2))
        {
            hr = S_OK;
            break;
        }
        pwchCurr = pwchCurr2;
    }

Cleanup:
    // Find the end of the line, but never look beyond the buffered text.
    while (pwchCurr < pwchEnd && !IsEnd(*pwchCurr))
        pwchCurr++;

    // Skip CRLF combos
    if (pwchCurr + 1 < pwchEnd && *pwchCurr == '\r' && pwchCurr[1] == '\n')
        pwchCurr++;

    // Eat the AllowDomain line so it doesn't screw up the data.
    // Never remove more than the buffer holds: the count is unsigned and
    // would otherwise wrap around.
    if (pwchCurr >= pwchEnd)
        cchConsumed = m_ucWcharBufCount;
    else
        cchConsumed = (UINT)(pwchCurr - m_psWcharBuf) + 1;

    m_ucWcharBufCount -= cchConsumed;
    memmove(m_psWcharBuf, m_psWcharBuf + cchConsumed,
            m_ucWcharBufCount*sizeof(WCHAR));

    return hr;
}

//------------------------------------------------------------------------
//
//  Method:    CTDCUnify::CheckForAllowDomainList
//
//  Synopsis:  Checks the beginning of the Wide Char buffer to see if it
//             contains the string "@!allow.domains".  This is used to
//             determine if this file has a list of domain names which are
//             allowed to access this file, even though the access may be
//             coming from another internet host.
//
//             When the signature is found it is removed from the front
//             of the buffer; the rest of the line is left in place.
//
//  Arguments: uses CTDCUnify state variables for the Wide Char buffer:
//             m_psWcharBuf         the Wide char buffer
//             m_ucWcharBufCount    the # of chars in the wide char buf
//
//  Returns:   ALLOW_DOMAINLIST_NO        signature not found
//             ALLOW_DOMAINLIST_YES       signature was found
//             ALLOW_DOMAINLIST_DONTKNOW  don't have enough characters
//                                        to know for sure yet.
//
//------------------------------------------------------------------------

CTDCUnify::ALLOWDOMAINLIST
CTDCUnify::CheckForAllowDomainList()
{
    ULONG cAllowDomainLen = ocslen(ALLOW_DOMAIN_STRING);

    // No buffer yet means no characters to judge by.
    if (m_psWcharBuf == NULL)
        return ALLOW_DOMAINLIST_DONTKNOW;

    // Make sure we have a whole line.  The scan is limited to the
    // characters actually held in the buffer.
    LPWCH pwchCurr = m_psWcharBuf;
    LPWCH pwchEnd = m_psWcharBuf + m_ucWcharBufCount;
    while (pwchCurr < pwchEnd && !IsEnd(*pwchCurr)) pwchCurr++;
    if (pwchCurr == pwchEnd || *pwchCurr == '\0')   // if buffer ended before line did
        return ALLOW_DOMAINLIST_DONTKNOW;

    // Only compare when the buffer is long enough to hold the signature,
    // so the comparison and the removal below stay inside the buffer.
    if (m_ucWcharBufCount >= cAllowDomainLen &&
        0 == wch_incmp(m_psWcharBuf, ALLOW_DOMAIN_STRING, cAllowDomainLen))
    {
        // We matched equal and have the whole string.
        // Take the "@!allow.domains" out of the buffer..
        m_ucWcharBufCount -= cAllowDomainLen;
        memmove(m_psWcharBuf, &m_psWcharBuf[cAllowDomainLen],
                m_ucWcharBufCount*sizeof(WCHAR));
        return ALLOW_DOMAINLIST_YES;
    }

    // We didn't match equal, no point in looking any more.
    return ALLOW_DOMAINLIST_NO;
}