328 lines
12 KiB
C++
Executable File
328 lines
12 KiB
C++
Executable File
/** \file
|
|
* Defines the basic structures used to manipulate character
|
|
* streams from any input source. Any character size and encoding
|
|
* can in theory be used, so long as a set of functinos is provided that
|
|
* can return a 32 bit Integer representation of their characters amd efficiently mark and revert
|
|
* to specific offsets into their input streams.
|
|
*/
|
|
#ifndef _ANTLR_INPUT_HPP
|
|
#define _ANTLR_INPUT_HPP
|
|
|
|
// [The "BSD licence"]
|
|
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
|
|
|
|
//
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
// 3. The name of the author may not be used to endorse or promote products
|
|
// derived from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#include "antlr3defs.hpp"
|
|
|
|
ANTLR_BEGIN_NAMESPACE()
|
|
|
|
/// Master context structure for an ANTLR3 C runtime based input stream.
|
|
/// \ingroup apistructures. Calling _LT on this doesn't seem right. You would
|
|
/// call it only with parser / TreeParser, and their respective input streams
|
|
/// has that function. calling it from lexer will throw a compile time error
|
|
///
|
|
|
|
template<class ImplTraits>
|
|
class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >
|
|
{
|
|
public:
|
|
typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
|
|
typedef typename ImplTraits::LexStateType LexStateType;
|
|
typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType;
|
|
typedef IntStreamType BaseType;
|
|
typedef typename ImplTraits::StreamDataType UnitType;
|
|
typedef UnitType DataType;
|
|
typedef UnitType TokenType;
|
|
typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType;
|
|
typedef typename ImplTraits::StringType StringType;
|
|
|
|
private:
|
|
/** Pointer the start of the input string, characters may be
|
|
* taken as offsets from here and in original input format encoding.
|
|
*/
|
|
const DataType* m_data;
|
|
|
|
/** Pointer to the next character to be consumed from the input data
|
|
* This is cast to point at the encoding of the original file that
|
|
* was read by the functions installed as pointer in this input stream
|
|
* context instance at file/string/whatever load time.
|
|
*/
|
|
const DataType* m_nextChar;
|
|
|
|
/** Number of characters that can be consumed at this point in time.
|
|
* Mostly this is just what is left in the pre-read buffer, but if the
|
|
* input source is a stream such as a socket or something then we may
|
|
* call special read code to wait for more input.
|
|
*/
|
|
ANTLR_UINT32 m_sizeBuf;
|
|
|
|
/** The line number we are traversing in the input file. This gets incremented
|
|
* by a newline() call in the lexer grammar actions.
|
|
*/
|
|
ANTLR_UINT32 m_line;
|
|
|
|
/** Pointer into the input buffer where the current line
|
|
* started.
|
|
*/
|
|
const DataType* m_currentLine;
|
|
|
|
/** The offset within the current line of the current character
|
|
*/
|
|
ANTLR_INT32 m_charPositionInLine;
|
|
|
|
/** Tracks how deep mark() calls are nested
|
|
*/
|
|
ANTLR_UINT32 m_markDepth;
|
|
|
|
/** List of mark() points in the input stream
|
|
*/
|
|
MarkersType m_markers;
|
|
|
|
/** File name string, set to pointer to memory if
|
|
* you set it manually as it will be free()d
|
|
*/
|
|
StringType m_fileName;
|
|
|
|
/** File number, needs to be set manually to some file index of your devising.
|
|
*/
|
|
ANTLR_UINT32 m_fileNo;
|
|
|
|
/// Character that automatically causes an internal line count
|
|
/// increment.
|
|
///
|
|
ANTLR_UCHAR m_newlineChar;
|
|
|
|
/// Indicates the size, in 8 bit units, of a single character. Note that
|
|
/// the C runtime does not deal with surrogates as this would be
|
|
/// slow and complicated. If this is a UTF-8 stream then this field
|
|
/// will be set to 0. Generally you are best working internally with 32 bit characters
|
|
/// as this is the most efficient.
|
|
///
|
|
ANTLR_UINT8 m_charByteSize;
|
|
|
|
/** Indicates if the data pointer was allocated by us, and so should be freed
|
|
* when the stream dies.
|
|
*/
|
|
bool m_isAllocated;
|
|
|
|
/// Indicates the encoding scheme used in this input stream
|
|
///
|
|
ANTLR_UINT32 m_encoding;
|
|
|
|
/* API */
|
|
public:
|
|
InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding);
|
|
InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name);
|
|
~InputStream();
|
|
const DataType* get_data() const;
|
|
bool get_isAllocated() const;
|
|
const DataType* get_nextChar() const;
|
|
ANTLR_UINT32 get_sizeBuf() const;
|
|
ANTLR_UINT32 get_line() const;
|
|
const DataType* get_currentLine() const;
|
|
ANTLR_INT32 get_charPositionInLine() const;
|
|
ANTLR_UINT32 get_markDepth() const;
|
|
MarkersType& get_markers();
|
|
const StringType& get_fileName() const;
|
|
ANTLR_UINT32 get_fileNo() const;
|
|
ANTLR_UCHAR get_newlineChar() const;
|
|
ANTLR_UINT8 get_charByteSize() const;
|
|
ANTLR_UINT32 get_encoding() const;
|
|
|
|
void set_data( DataType* data );
|
|
void set_isAllocated( bool isAllocated );
|
|
void set_nextChar( const DataType* nextChar );
|
|
void set_sizeBuf( ANTLR_UINT32 sizeBuf );
|
|
void set_line( ANTLR_UINT32 line );
|
|
void set_currentLine( const DataType* currentLine );
|
|
void set_charPositionInLine( ANTLR_INT32 charPositionInLine );
|
|
void set_markDepth( ANTLR_UINT32 markDepth );
|
|
void set_markers( const MarkersType& markers );
|
|
void set_fileName( const StringType& fileName );
|
|
void set_fileNo( ANTLR_UINT32 fileNo );
|
|
void set_newlineChar( ANTLR_UCHAR newlineChar );
|
|
void set_charByteSize( ANTLR_UINT8 charByteSize );
|
|
void set_encoding( ANTLR_UINT32 encoding );
|
|
|
|
void inc_charPositionInLine();
|
|
void inc_line();
|
|
void inc_markDepth();
|
|
|
|
IntStreamType* get_istream();
|
|
|
|
/** Function that resets the input stream
|
|
*/
|
|
void reset();
|
|
|
|
/** Pointer to a function that reuses and resets an input stream by
|
|
* supplying a new 'source'
|
|
*/
|
|
void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name);
|
|
|
|
|
|
/** Function to return the total size of the input buffer. For streams
|
|
* this may be just the total we have available so far. This means of course that
|
|
* the input stream must be careful to accumulate enough input so that any backtracking
|
|
* can be satisfied.
|
|
*/
|
|
ANTLR_UINT32 size();
|
|
|
|
/** Function to return a substring of the input stream. String is returned in allocated
|
|
* memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form.
|
|
*/
|
|
StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop);
|
|
|
|
/** Function to return the current line number in the input stream
|
|
*/
|
|
ANTLR_UINT32 get_line();
|
|
|
|
/** Function to return the current line buffer in the input stream
|
|
* The pointer returned is directly into the input stream so you must copy
|
|
* it if you wish to manipulate it without damaging the input stream. Encoding
|
|
* is obviously in the same form as the input stream.
|
|
* \remark
|
|
* - Note taht this function wil lbe inaccurate if setLine is called as there
|
|
* is no way at the moment to position the input stream at a particular line
|
|
* number offset.
|
|
*/
|
|
const DataType* getLineBuf();
|
|
|
|
/** Function to return the current offset in the current input stream line
|
|
*/
|
|
ANTLR_UINT32 get_charPositionInLine();
|
|
|
|
/** Function to set the current position in the current line.
|
|
*/
|
|
void set_charPositionInLine(ANTLR_UINT32 position);
|
|
|
|
/** Function to override the default newline character that the input stream
|
|
* looks for to trigger the line/offset and line buffer recording information.
|
|
* \remark
|
|
* - By default the chracter '\n' will be installed as the newline trigger character. When this
|
|
* character is seen by the consume() function then the current line number is incremented and the
|
|
* current line offset is reset to 0. The Pointer for the line of input we are consuming
|
|
* is updated to point to the next character after this one in the input stream (which means it
|
|
* may become invalid if the last newline character in the file is seen (so watch out).
|
|
* - If for some reason you do not want the counters and pointers to be restee, you can set the
|
|
* chracter to some impossible character such as '\0' or whatever.
|
|
* - This is a single character only, so choose the last character in a sequence of two or more.
|
|
* - This is only a simple aid to error reporting - if you have a complicated binary input structure
|
|
* it may not be adequate, but you can always override every function in the input stream with your
|
|
* own of course, and can even write your own complete input stream set if you like.
|
|
* - It is your responsiblity to set a valid character for the input stream type. There is no point
|
|
* setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
|
|
* trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
|
|
*/
|
|
void set_newLineChar(ANTLR_UINT32 newlineChar);
|
|
|
|
ANTLR_MARKER index_impl();
|
|
|
|
private:
|
|
/** \brief Use the contents of an operating system file as the input
|
|
* for an input stream.
|
|
*
|
|
* \param fileName Name of operating system file to read.
|
|
* \return
|
|
* - Pointer to new input stream context upon success
|
|
* - One of the ANTLR3_ERR_ defines on error.
|
|
*/
|
|
void createFileStream(const ANTLR_UINT8* fileName);
|
|
|
|
/** \brief Use the supplied 'string' as input to the stream
|
|
*
|
|
* \param data Pointer to the input data
|
|
* \return
|
|
* - Pointer to new input stream context upon success
|
|
* - NULL defines on error.
|
|
*/
|
|
void createStringStream(const ANTLR_UINT8* data);
|
|
void genericSetupStream();
|
|
|
|
/// Determine endianess of the input stream and install the
|
|
/// API required for the encoding in that format.
|
|
///
|
|
void setupInputStream();
|
|
|
|
};
|
|
|
|
/** \brief Structure for track lex input states as part of mark()
|
|
* and rewind() of lexer.
|
|
*/
|
|
template<class ImplTraits>
|
|
class LexState : public ImplTraits::AllocPolicyType
|
|
{
|
|
public:
|
|
typedef typename ImplTraits::StreamDataType DataType;
|
|
|
|
private:
|
|
/** Pointer to the next character to be consumed from the input data
|
|
* This is cast to point at the encoding of the original file that
|
|
* was read by the functions installed as pointer in this input stream
|
|
* context instance at file/string/whatever load time.
|
|
*/
|
|
const DataType* m_nextChar;
|
|
|
|
/** The line number we are traversing in the input file. This gets incremented
|
|
* by a newline() call in the lexer grammer actions.
|
|
*/
|
|
ANTLR_UINT32 m_line;
|
|
|
|
/** Pointer into the input buffer where the current line
|
|
* started.
|
|
*/
|
|
const DataType* m_currentLine;
|
|
|
|
/** The offset within the current line of the current character
|
|
*/
|
|
ANTLR_INT32 m_charPositionInLine;
|
|
|
|
public:
|
|
LexState();
|
|
const DataType* get_nextChar() const;
|
|
ANTLR_UINT32 get_line() const;
|
|
const DataType* get_currentLine() const;
|
|
ANTLR_INT32 get_charPositionInLine() const;
|
|
void set_nextChar( const DataType* nextChar );
|
|
void set_line( ANTLR_UINT32 line );
|
|
void set_currentLine( const DataType* currentLine );
|
|
void set_charPositionInLine( ANTLR_INT32 charPositionInLine );
|
|
};
|
|
|
|
class ParseNullStringException : public std::exception
|
|
{
|
|
virtual const char* what() const throw()
|
|
{
|
|
return "Null String";
|
|
}
|
|
};
|
|
|
|
ANTLR_END_NAMESPACE()
|
|
|
|
#include "antlr3input.inl"
|
|
|
|
#endif /* _ANTLR_INPUT_H */
|