_TAWK, A Simple Interpreter in C++_
by Bruce Eckel


// FIELD.HXX: used by csascii class to build a single field.
// Fields are collected by csascii to create a record.
// by Bruce Eckel,

// One field of a comma-separated ASCII record. Constructing a
// field reads characters from the given stream up to the next
// unquoted comma or newline and keeps them (quotes and CRs
// removed) as a null-terminated string on the heap.
class field {
  istream * input; // where to get the data
  char * data; // heap buffer with the field text (0 if none)
  int length, fsize; // length: scan counter; fsize: final size
  int end_of_file; // flag to indicate the end of file happened
  void getfield(); // recursive function to read in a field;
        // treats data, length & input as globals
  int infield; // flag used by getfield() to determine whether
        // it's inside a quoted field
public:
  field(istream & instream); // reads one field from instream
  ~field(); // frees the text buffer
  // Writes the field text to s; writes nothing when no text
  // buffer was ever allocated.
  friend ostream& operator<<(ostream &s, field & f) {
    if (f.data)
      s << f.data;
    return s;
  }
  int eof() { return end_of_file; } // to check for end
  int size() { return fsize;} // number of characters stored
  int last_length() {return length; } // current scan counter
  char * string() { return data; } // field text; may be 0
};


// FIELD.CXX: definitions for class field
// A "recursive descent" scanning scheme is used because field
// length is always unknown.
// by Bruce Eckel
#include "field.hxx"

// Build a field by reading it from instream. All scanner state
// is reset first; getfield() then does the actual reading.
field::field(istream & instream) {
  input = &instream;
  length = 0;
  fsize = 0; // so size() is defined even if nothing is read
  end_of_file = 0; // set flag to say "we're not at the end"
  infield = 0; // set flag to say "we're not inside a field"
  data = (char *)0; // to show no memory has been allocated
  getfield(); // recursively get characters until end of field
}

// Release the text buffer. The buffer is allocated with new[]
// in getfield(), so it must be released with delete [].
field::~field() {
  delete [] data; // if no memory has been allocated,
  // data = (char *)0 so this will have no effect.
}

// A Comma-separated ASCII field is contained in quotes to allow
// commas within the field; these quotes must be stripped out
// Read one field. Each call reads one character and, while the
// field continues, recurses; the call that sees the terminator
// allocates the buffer, and the characters are stored as the
// recursion unwinds (last character first).
// End of file is treated as a terminator too, so the buffer is
// always allocated before any character is stored into it.
void field::getfield() {
  char c = 0;
  // This happens when DEscending:
  int at_end = 0;
  if((input->get(c)).eof() ) {
    end_of_file++; // just say we reached the end...
    at_end = 1; // ...and finish the field below
  }
  // watch out for the Unix vs. DOS LF/CR problem here:
  if (!at_end && ((c != ',') || infield) && (c != '\n')) {
    if ( (c != '"') && (c != '\r')) // watch for quotes or CR
      length++; // no quotes -- count this character
    else {
      if ( c == '"')
        infield = !infield; // if we weren't inside a field
        // and a quote was encountered, we are now inside
        // a field. If we were inside a field and a quote
        // was found, we're out of the field.
      c = 0; // a quote or CR; mark it so it isn't included
    }
    getfield(); // recursively get characters in field
    // after returning from function call, we jump past
    // the following "else" part to finish the recursion
  }
  else { // This happens once, when the terminator is found:
    fsize = length; // remember how long the string is
    data = new char[length + 1]; // space for null terminator
    data[length] = '\0'; // highest index is "length"
    // when you allocate an array of length + 1
    length--; // notice we don't insert the delimiter
    return; // start rising back up
  }
  // This happens when Ascending:
  if ( c ) // if it wasn't a quote or CR,
    data[length--] = c; // put chars in as we rise back up...
}


// CSASCII.HXX: class to manipulate comma-separated ASCII
// database files.
//by Bruce Eckel
#include "field.hxx"

class csascii { // manipulates comma-separated ascii files,
// generated by most database management systems (generated and
// used by the BASIC programming language). Each field
// is separated by a comma; records are separated by newlines.
  int fieldcount; // fields per record, counted from line one
  field ** data; // an array to hold the entire record
  istream * datafile; // file with comma separated ASCII input
  int readrecord(); // private function to read a record
public:
  csascii( char * filename ); // Open file, get first record
  ~csascii(); // destructor
  int next(); // get next record, return 0 when EOF
  field & operator[](int index); // select a field
  int number_of_fields() { return fieldcount; }
};


// CSASCII.CXX: function definitions for comma-separated
// ascii database manipulation class
// by Bruce Eckel,
#include "csascii.hxx"

// Read one record: one new field object per slot of data[].
// Returns 1 when a whole record was read, 0 when end of file
// was hit first. On early exit the slots that were not filled
// are set to 0 so they never hold stale pointers.
int csascii::readrecord() {
  for (int fieldnum = 0; fieldnum < fieldcount; fieldnum++ ) {
    data[fieldnum] = new field(*datafile);
    if (data[fieldnum]->eof()) {
      for (int rest = fieldnum + 1; rest < fieldcount; rest++)
        data[rest] = 0; // nothing was read for these
      return 0;
    }
  }
  return 1;
}

// Open filename, count the fields in its first line, then
// re-open the file and read the first record.
// filename: path of the comma-separated ASCII file.
csascii::csascii( char * filename ) {
  char c = 0;
  fieldcount = 0;
  int quote = 0;
  // first, determine the number of fields in a record:
  {
    // See text for dangers of opening files this way:
    istream infile(new filebuf->open(filename, input));
    // stop at end of file as well as at the newline, so a file
    // without a newline cannot loop forever:
    while(!(infile.get(c)).eof() && c != '\n') {
      // keep track of being inside a quoted string:
      if (c == '"') quote = !quote;
      // fields are delimited by unquoted commas:
      if ( c == ',' && !quote)
        fieldcount++;
    }
  } // infile goes out of scope; file closed
  fieldcount++; // last field terminated by newline, not comma
  // an array of field pointers:
  data = new field * [ fieldcount ];
  // re-open at start; dynamically allocate so it isn't scoped:
  datafile = new istream(new filebuf->open(filename, input));
  readrecord(); // get first record
}

// Free the array of field pointers and close the data file.
// The array comes from new[], so it is released with delete [].
// NOTE(review): the field objects the array points to are not
// deleted here -- confirm whether that is intended.
csascii::~csascii() {
  delete [] data;
  delete datafile; // calls istream destructor to close file
}

// Discard the current record and read the next one.
// Returns 0 when end of file is reached, otherwise 1.
int csascii::next() {
  for (int i = 0; i < fieldcount; i++ ) {
    delete data[i]; // free all the data storage
    data[i] = 0; // never leave a dangling pointer behind
  }
  return readrecord(); // 0 when end of file
}

// Select field number index (0-based) of the current record.
// An index outside the record is a fatal error: a message is
// printed and the program exits instead of reading past the
// array.
field & csascii::operator[](int index) {
  if (index < 0 || index >= fieldcount) {
    cerr << "index too large for number of fields in record\n";
    exit(1);
  }
  return *(data[index]);
}


// LOOKUP.CXX: simple use of csascii to find name in a database
// by Bruce Eckel,
#include "csascii.hxx"

// lookup: print every record whose first field equals argv[1].
// Exits with status 1 when no name is given.
int main(int argc, char ** argv) {
  if (argc < 2) {
    cerr << "usage: lookup lastname\n";
    exit(1);
  }
  // This puts the database file in the root directory:
  csascii file("\\ppquick.asc"); // create object & open file
  int found = 0; // indicates one record was found
  do {
    // string() can be 0 when no text was read for the field
    if (file[0].string() != 0 &&
        strcmp(file[0].string(),argv[1]) == 0) {
      found++; // found one. File is sorted, so if we stop
      // finding them, quit instead of wasting time.
      cout << chr(27) << "[2J"; // ANSI clear screen
      for (int i = 0; i < file.number_of_fields(); i++)
        cout << file[i] << "\n";
      cout << chr(27) << "[7m" << "press any key" <<
        chr(27) << "[0m";
      if( getch() == 27) break; // ESC quits
    } else if (found) exit(0); // quit if that was the last
  } while (file.next()); // until the data file runs out
  return 0;
}


// PARSE.HXX: class to parse a tawk script file. Creates
// a structure which can be used at run-time to "execute"
// the tawk script.
// by Bruce Eckel,

// types of tokens the scanner can find:
enum tokentype {
  fieldnumber, // @(n): print field n of the current record
  string, // literal text
  if_, // @?n@: start of a conditional on field n
  else_, // @~
  endif_, // @.
  phase_change // @main or @conclusion
};

// preamble and conclusion of the tawk script are only executed
// once, while main is executed once for every data record
enum phase {
  preamble, // section before @main
  tmain, // the @main section
  conclusion // the @conclusion section
};

// One token of a tawk script. Constructing a token scans the
// next token from the given stream.
class token {
  tokentype ttype;
  union { // an "anonymous union"
    int fieldnum; // if type is a fieldnumber
    unsigned char * literal; // if type is a string
    int if_level; // if this is an if_, then_, or else_
  };
  // private functions:
  void get_token(); // recursive descent scanner
  // Functions to help in scanning:
  void getnext(char & c); // used by get_token();
  unsigned char get_value(char delimiter, char * msg);
  void dumpline(); // for @! comments
  void error(char * msg = "", char * msg2 = "");
public:
  token(istream & input);
  ~token();
  friend ostream & operator<<(ostream &s, token &t);
  int field_number() { return fieldnum; }
  int token_type() { return ttype; }
  int nesting_level() { return if_level;}
};

// The following is called a "container class," since its sole
// purpose is to hold a list of objects (tokens, in this case):
class parse_array {
  token ** tokenarray; // an array of token pointers
  istream * parse_stream;
  int token_count; // scratch counter used by build_array()
  int end; // the size of the array
  phase p_section; // of the program (preamble, etc.)
  void build_array(); // another recursive function
public:
  parse_array(istream & input);
  ~parse_array();
  int size() { return end; } // how big is it?
  token & operator[](int index); // select a token
  phase section() { return p_section; }
};


// PARSE.CXX: class parse function definitions
// by Bruce Eckel,
#include "csascii.hxx"
#include "parse.hxx"

// The following are "file static," which means no one outside
// this file can know about them. This is the meaning when a
// global variable is declared "static."
// Scanner position:
static istream * tokenstream = 0; // stream being scanned
static int line_number = 1; // line counting for errors
static int end_of_file = 0; // zero means not end of file
// State of the token being built:
static int length = 0; // to remember size of string
static int if_counter = 0; // monitors "if" statement nesting
static phase program_section = preamble; // ... until @main

// Scan the next token from input. The stream and the character
// counter are kept in file statics shared with get_token().
token::token(istream & input) {
  // initialize values and start the descent
  tokenstream = &input;
  length = 0;
  get_token(); // recursively get characters to end of token
}

// Only string tokens own heap memory. The text is allocated
// with new[] in get_token(), so release it with delete [].
token::~token() { // delete heap if any has been allocated:
  if (ttype == string)
    delete [] literal;
}

void token::error(char * msg, char * msg2) {
cerr << "token error on line " << line_number << ": " <<
msg << " " << msg2 << "\n";

// Print a token: a string token prints its text, a fieldnumber
// token prints a debugging line; other types print nothing.
ostream & operator<<(ostream &s, token &t) {
  switch (t.ttype) {
    case string:
      s << (char *)t.literal;
      break; // don't fall into the fieldnumber case
    case fieldnumber: // only for testing
      s << " fieldnumber: " << t.fieldnum << "\n";
      break;
  }
  return s;
}

// Get a character from the tokenstream, checking for
// end-of-file and newlines
void token::getnext(char & c) {
error("attempt to read after @end statement\n",
"missing @conclusion ?");
if((tokenstream->get(c)).eof() )
error("@end statement missing");
if (c == '\n')
line_number++; // keep track of the line count

// See text for description of tokens
// Scan one token. Plain text is collected recursively: each
// call counts one character and recurses; the call that meets
// the terminating '@' allocates the string, and the characters
// are stored as the recursion unwinds. An '@' at the start of
// a token introduces a command, selected by the next character.
void token::get_token() {
  char c;
  // This happens when DEscending:
  getnext(c);
  if ( c == '@') {
    if (length == 0) { // length 0 means start of token
      getnext(c); // the command character
      switch(c) {
        case '!': // comment line
          dumpline(); // dump the comment
          get_token(); // get a real token
          return; // that token is complete; nothing to store
        case 'p' : case 'P' : // preamble statement
          if ( program_section != preamble )
            error("only one preamble allowed");
          dumpline(); // just for looks, ignore it
          get_token(); // get a real token
          return;
        case 'm' : case 'M' : // start of main loop
          dumpline(); // toss rest of line
          program_section = tmain;
          ttype = phase_change;
          return; // very simple token
        case 'c' : case 'C' : // start conclusion
          dumpline(); // toss rest of line
          program_section = conclusion;
          ttype = phase_change;
          return; // very simple token
        case 'e' : case 'E': // end statement
          end_of_file++; // set flag
          ttype = fieldnumber; // so destructor doesn't
          // delete free store for this token.
          if (if_counter)
            error("unclosed 'if' statement(s)");
          return;
        case '(' :
          if ( program_section == preamble ||
              program_section == conclusion )
            error("@() not allowed in preamble or conclusion");
          fieldnum = get_value(')',"@()");
          ttype = fieldnumber;
          // This is a complete token, so quit
          return;
        case '<' :
          c = get_value('>',"@<>");
          length++; // the value counts as one character
          get_token(); // get more...
          break; // c is stored below, on the way back up
        case '?' : // beginning of an "if" statement
          if ( program_section == preamble ||
              program_section == conclusion )
            error("@? not allowed in preamble or conclusion");
          fieldnum = get_value('@',"@?@");
          ttype = if_;
          getnext(c); // just eat the colon
          if(c != ':')
            error("@? must be followed by @: (then)");
          if_level = ++if_counter; // for nesting
          return;
        case '~' : // the "else" part of an "if" statement
          ttype = else_;
          if_level = if_counter;
          return;
        case '.' : // "endif" terminator of an "if" statement
          ttype = endif_;
          if_level = if_counter--;
          if(if_counter < 0)
            error("incorrect nesting of if-then-else clauses");
          return;
        case '@' : // two '@' in a row mean print an '@'
          length++; // just leave '@' as the value of c
          get_token();
          break; // '@' is stored below, on the way back up
        default:
          error("'@' must be followed by:",
            "'(', '<', '?',':','~','.','p','m','c' or '@'");
          return;
      }
    } else { // an '@' in the middle of a string; terminate
      // the string. Putback() is part of the stream class.
      // It is only safe to put one character back on the input
      tokenstream->putback(c); // to be used by the next token
      // allocate space, put the null in and return up the stack
      literal = new unsigned char[length + 1]; // space for '\0'
      literal[length--] = '\0'; // string delimiter
      ttype = string; // what kind of token this is
      return; // back up the stack
    }
  } else { // not an '@', must be plain text
    length++; // count it and keep descending
    get_token();
  }
  // This occurs on the "tail" of the recursion:
  literal[length--] = c; // put chars in as we rise back up...
}

// This function is used by get_token when it encounters a @(
// or a @< to get a number until it finds "delimiter."
// If an error occurs, msg is used to notify the user what
// kind of statement it is.
unsigned char token::get_value(char delimiter, char * msg) {
char c;
char buf[5];
int i = 0;
while(getnext(c), c != delimiter) {
if (!isdigit(c))
error("must use only digits inside", msg);
buf[i++] = c;
buf[i] = 0;
return atoi(buf);

// Discard the rest of the current script line, newline included.
void token::dumpline() { // called when '@!' encountered
  char c;
  while(getnext(c), c != '\n')
    ; // just eat characters until newline
}

// Since there's no way to know how big a parse_array is
// going to be until the entire tawkfile has been tokenized,
// the recursive approach is again used:

// Collect the tokens of one script section from input.
// The section is recorded before scanning starts, because
// scanning the token that ends this section changes
// program_section.
parse_array::parse_array(istream & input) {
  parse_stream = &input;
  token_count = 0;
  p_section = program_section; // so we know at run-time
  build_array(); // fills tokenarray and sets end
}

void parse_array::build_array() {
token * tk = new token(*parse_stream);
if( ! end_of_file && tk->token_type() != phase_change) {
// normal token, not end of file or phase change:
// recursively get tokens until eof or phase change:
} else { // end of file or phase change
// only done once per object:
// allocate memory and return up the stack
tokenarray = new token * [end = token_count];
if(token_count) token_count--; // only if non-zero
tokenarray[token_count--] = tk; // performed on the "tail"

// Delete every token, then the pointer array itself. The array
// comes from new[], so it is released with delete [].
parse_array::~parse_array() {
  for (int i = 0; i < end; i++)
    delete tokenarray[i];
  delete [] tokenarray;
}

// Select token number index (0-based). An index outside the
// array is a fatal error: a message is printed and the program
// exits instead of reading past the array.
token & parse_array::operator[](int index) {
  if ( index < 0 || index >= end ) {
    cerr << "parse_array error: index " << index
      << " out of bounds\n";
    exit(1);
  }
  return *tokenarray[index];
}


// TAWK.CXX: parses a tawk script and reads an ascii file;
// generates results according to the tawk script.
// by Bruce Eckel,
#include "csascii.hxx"
#include "parse.hxx"

main (int argc, char * argv[]) {
int screen = 0; // flag set true if screen output desired
if (argc < 3) {
cerr << "usage: tawk tawkfile datafile\n" <<
"trailing -s pages output to screen";
if (argc == 4) {
if (argv[3][0] != '-') {
cerr << "must use '-' before trailing flag\n";
} else
if (argv[3][1] != 's') {
cerr << "'s' is only trailing flag allowed";
} else
screen++; // set screen output flag true
istream tawkfile(new filebuf->open(argv[1], input));
parse_array Apreamble(tawkfile); // the @preamble
parse_array Amain(tawkfile); // the @main section
parse_array Aconclusion(tawkfile); // the @conclusion
csascii datafile(argv[2]); // make a comma-separated ASCII
// object from the second arg
// ------ @preamble ------
for (int i = 0; i < Apreamble.size(); i++)
cout << Apreamble[i]; // preamble can only contain strings
if(screen) {
// ANSI reverse video sequence:
cout << chr(27) << "[7m" << "press any key" <<
chr(27) << "[0m";
// ------ The Central Loop (@main) -------
do { // for each record in the data file
if(screen) cout << chr(27) << "[2J"; // ANSI clear screen
for(int i = 0; i < Amain.size(); i++) {
switch(Amain[i].token_type()) {
case fieldnumber:
cout << datafile[Amain[i].field_number()];
case string:
cout << Amain[i];
case if_:
int fn = Amain[i].field_number();
if (datafile[fn].size() == 0) { // conditional false
int level = Amain[i].nesting_level();
// find the "else" statement on the same level:
while ( !(Amain[i].token_type() == else_
&& Amain[i].nesting_level() == level))
} // conditional true -- just continue
case else_: // an "if" conditional was true so skip
// all the statements in the "else" clause
int level = Amain[i].nesting_level();
// find the "endif" statement on the same level:
while ( !(Amain[i].token_type() == endif_
&& Amain[i].nesting_level() == level))
case endif_: // after performing the "else" clause
break; // ignore it; only used to find the end
// of the conditional when "if" is true.
default: // should never happen (caught in parsing)
cerr << "unknown statement encountered at run-time\n";
if(screen) {
cout << chr(27) << "[7m" <<
"press a key (ESC quits)" << chr(27) << "[0m";
if( getch() == 27) break;
} while (; // matches do { ...
// ------ @conclusion ------
for ( i = 0; i < Aconclusion.size(); i++)
cout << Aconclusion[i]; //conclusion contains only strings


# makefile for tawk.exe & lookup.exe
# Recipe lines must begin with a tab character.
# Zortech C++:
CPP = ztc
# Glockenspiel C++ w/ MSC 4:
#CPP = ccxx !4

all: tawk.exe lookup.exe

tawk.exe : tawk.obj parse.obj csascii.obj field.obj
	$(CPP) tawk.obj parse.obj csascii.obj field.obj

# lookup.cxx includes csascii.hxx, which includes field.hxx
lookup.exe : lookup.cxx csascii.hxx field.hxx csascii.obj field.obj
	$(CPP) lookup.cxx csascii.obj field.obj

tawk.obj : tawk.cxx parse.hxx csascii.hxx field.hxx
	$(CPP) -c tawk.cxx

# parse.cxx includes csascii.hxx (and through it field.hxx)
parse.obj : parse.cxx parse.hxx csascii.hxx field.hxx
	$(CPP) -c parse.cxx

csascii.obj : csascii.cxx csascii.hxx field.hxx
	$(CPP) -c csascii.cxx

field.obj : field.cxx field.hxx
	$(CPP) -c field.cxx


@! A tawk script to reformat a comma-separated ASCII file
@! with 6 fields. This creates a new CS-ASCII file with
@! fields 4 and 5 combined.
"@(0)","@(1)","@(2)","@(3)","@(4)@?4@: @~@.@(5)"


@! Tawkfile to create a tiny phone listing for a wallet
@! on a Hewlett-Packard Laserjet-compatible printer
@! From a comma-separated ASCII file generated by a DBMS
@<27>&l5C@! approximately 10 lines per inch
@<27>(s16.66H@! small typeface, built into Laserjet
@! last, first, (area code) phone1
@ phone2, if it exists
@<27>E @! Reset the Laserjet


// A minimal class: one private int, a constructor that sets
// it, and a member function that prints it.
class tiny {
  // private stuff here (this is a comment)
  int i;
public: // public stuff here:
  // Prints "i = <value>" and a newline; returns what printf
  // returns (the number of characters written).
  int print() { // an "in-line" function
    return printf("i = %d\n",i);
  }
  tiny(int j); // constructors have the class name
  ~tiny() {} // destructors use a tilde
}; // classes end with a brace and a semicolon

// Store j as the value print() will show.
tiny::tiny(int j) { // non inline definition
  i = j;
}

// Demonstrates class tiny: construct, print, destroy.
int main() {
  tiny A(2); // implicit constructor call
  // A.i = 30; // error! private member
  A.print(); // calling a member function
  // implicit destructor call at end of scope
  return 0;
}


#include <stream.hxx> // cout automatically defined
// Prints a two-part greeting; the space before "today" keeps
// the number and the word apart in the output.
int main() {
  cout << "Hello, world!\n" << "I am "
    << 6 << " today!\n";
  return 0;
}


filebuf f1;
if ([1],input) == 0) {
cout << "cannot open " << argv[1] << "\n";
istream infile(&f1);


"Ball","Mike","Oregon Software C++ Compiler"
"Bright","Walter","Zortech C++ Compiler"
"Carolan","John","Glockenspiel C++ Translator"
"Stroustrup","Bjarne","AT&T, C++ Creator"
"Tiemann","Michael","Free Software Foundation C++ Compiler"

