Mercurial > dnsbl
view src/tokenizer.cpp @ 94:e107ade3b1c0
fix dos line terminators
author | carl |
---|---|
date | Wed, 21 Sep 2005 12:35:29 -0700 |
parents | 1142e46be550 |
children | 91c27c00048f |
line wrap: on
line source
/*

Copyright (c) 2004 Carl Byington - 510 Software Group, released under
the GPL version 2 or any later version at your choice available at
http://www.fsf.org/licenses/gpl.txt

*/

#include "dnsbl.h"

// Version-control ident string for this translation unit.
static const char* tokenizer_version="$Id$";

// States of the tokenizer finite state machine.
//
// The values before end_state are "resting" states: they are the only
// values that may be used as the column index into parse_table.  The
// values after end_state are transient actions returned by the table;
// TOKEN::next() performs the action and then maps it back to one of the
// resting states.
enum state {s_init,         // nothing collected yet
            s_token,        // collecting an unquoted token
            s_string,       // collecting the body of a quoted string
            s_ignore,       // whitespace
            s_eol,          // ignore to eol
            end_state,      // number of resting states (table width)
            s_term,         // token terminator
            s_single,       // single character token
            s_string1,      // first " of string
            s_string2,      // last " of string
            s_slash         // possible start of ignore to eol
            };

// One table row: the next state/action for each resting state.
typedef state PARSE[end_state];

// Transition table, indexed as parse_table[input_byte][current_state].
// Each row holds, for one input byte, the next state or action for each
// of the five resting states, in the column order shown below.
static PARSE parse_table[256] = {
    // s_init     s_token  s_string   s_ignore   s_eol
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x00
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x01
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x02
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x03
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x04
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x05
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x06
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x07
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x08
    { s_ignore,  s_term,  s_string,  s_ignore,  s_eol,    }, // 0x09 <tab>
    { s_ignore,  s_term,  s_string2, s_ignore,  s_ignore, }, // 0x0a <lf>
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x0b
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x0c
    { s_ignore,  s_term,  s_string2, s_ignore,  s_eol,    }, // 0x0d <cr>
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x0e
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x0f
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x10
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x11 xon char
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x12
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x13 xoff char
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x14
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x15
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x16
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x17
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x18
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x19
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1a
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1b
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1c
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1d
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1e
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x1f
    { s_ignore,  s_term,  s_string,  s_ignore,  s_eol,    }, // 0x20 space
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x21 !
    { s_string1, s_term,  s_string2, s_string1, s_eol,    }, // 0x22 "
    { s_eol,     s_term,  s_string,  s_eol,     s_eol,    }, // 0x23 #
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x24 $
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x25 %
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x26 &
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x27 '
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x28 (
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x29 )
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x2A *
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x2B +
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x2C ,
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x2D -
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x2E .
    { s_slash,   s_token, s_string,  s_slash,   s_eol,    }, // 0x2F /
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x30 0
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x31 1
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x32 2
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x33 3
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x34 4
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x35 5
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x36 6
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x37 7
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x38 8
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x39 9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x3A :
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x3B ;
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x3C <
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x3D =
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x3E >
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x3F ?
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x40 @
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x41 A
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x42 B
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x43 C
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x44 D
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x45 E
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x46 F
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x47 G
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x48 H
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x49 I
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4A J
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4B K
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4C L
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4D M
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4E N
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x4F O
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x50 P
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x51 Q
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x52 R
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x53 S
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x54 T
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x55 U
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x56 V
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x57 W
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x58 X
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x59 Y
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x5A Z
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x5B [
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x5C backslash
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x5D ]
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x5E ^
    { s_single,  s_token, s_string,  s_single,  s_eol,    }, // 0x5F _
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x60 `
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x61 a
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x62 b
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x63 c
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x64 d
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x65 e
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x66 f
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x67 g
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x68 h
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x69 i
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6A j
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6B k
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6C l
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6D m
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6E n
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x6F o
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x70 p
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x71 q
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x72 r
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x73 s
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x74 t
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x75 u
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x76 v
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x77 w
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x78 x
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x79 y
    { s_token,   s_token, s_string,  s_token,   s_eol,    }, // 0x7A z
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x7B {
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x7C |
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x7D }
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x7E ~
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x7f
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x80
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x81
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x82
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x83
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x84
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x85
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x86
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x87
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x88
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x89
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8a
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8b
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8c
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8d
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8e
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x8f
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x90
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x91
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x92
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x93
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x94
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x95
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x96
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x97
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x98
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x99
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9a
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9b
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9c
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9d
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9e
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0x9f
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xa9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xaa
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xab
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xac
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xad
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xae
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xaf
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xb9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xba
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xbb
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xbc
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xbd
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xbe
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xbf
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xc9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xca
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xcb
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xcc
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xcd
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xce
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xcf
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xd9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xda
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xdb
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xdc
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xdd
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xde
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xdf
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xe9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xea
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xeb
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xec
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xed
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xee
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xef
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf0
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf1
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf2
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf3
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf4
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf5
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf6
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf7
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf8
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xf9
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xfa
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xfb
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xfc
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xfd
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xfe
    { s_single,  s_term,  s_string,  s_single,  s_eol,    }, // 0xff
};


// Create a tokenizer reading from file fn.  The name of every file that
// is successfully opened (fn and any later includes) is inserted into
// *includes.
TOKEN::TOKEN(char *fn, string_set *includes) {
    pushed        = false;
    include_files = includes;
    include(fn);
}


// Close and release every stream that is still open.
TOKEN::~TOKEN() {
    while (!streams.empty()) pop();
}


// Discard the innermost (front) input file: remove its stream, file name
// and line counter from the stacks, then close and delete the stream.
void TOKEN::pop() {
    ifstream *is = streams.front();
    char     *fn = filenames.front();
    streams.pop_front();
    filenames.pop_front();
    filenamess.erase(fn);
    linenumbers.pop_front();
    is->close();
    delete is;
}


// Push one character back; the next call to next_char() returns it.
// Only a single character of pushback is held.
void TOKEN::push_char(u_char c) {
    pushed      = true;
    pushed_char = c;
}


// Fetch the next input character, folded to lower case, into uc.
// Returns false when all input files are exhausted.  A '\n' read from a
// file increments that file's line counter.
bool TOKEN::next_char(u_char &uc) {
    if (pushed) {
        // tolower() needs a value representable as unsigned char, so go
        // through u_char rather than (possibly signed) char.
        uc     = (u_char)tolower((u_char)pushed_char);
        pushed = false;
        return true;
    }
    while (!streams.empty()) {
        ifstream *is = streams.front();
        int ch = is->get();
        if (is->fail()) {
            // End of file or a read error: either way nothing more can
            // be read from this stream, so drop it and resume reading
            // the enclosing file.
            pop();
            continue;
        }
        uc = (u_char)ch;
        if (uc == (u_char)'\n') {
            int &line = linenumbers.front();
            line++;
        }
        uc = (u_char)tolower(uc);
        return true;
    }
    return false;
}


// Start reading from file fn; it becomes the innermost input file.
// Returns false (after reporting the problem) if fn is already being
// read or cannot be opened.
bool TOKEN::include(char *fn) {
    string_set::iterator i = filenamess.find(fn);
    if (i != filenamess.end()) {
        my_syslog("redundant or recursive include file detected");
        return false;
    }
    ifstream *is = new ifstream;
    is->open(fn);
    if (is->fail()) {
        char buf[1000];
        snprintf(buf, sizeof(buf), "include file %s not found", fn);
        token_error(buf);
        delete is;      // the stream was never pushed, so nobody else owns it
        return false;
    }
    string_set &inc = *include_files;
    inc.insert(fn);
    streams.push_front(is);
    filenames.push_front(fn);
    filenamess.insert(fn);
    linenumbers.push_front(1);
    return true;
}


// Return the next token, or NULL when no token could be collected
// (including end of input).  Tokens that were pushed back are returned
// first.  The returned pointer is whatever register_string() yields for
// the collected text.  An include token followed by a file name and a
// semicolon token is handled here by switching to the named file.
char *TOKEN::next() {
    if (!pending_tokens.empty()) {
        char *t = pending_tokens.front();
        pending_tokens.pop_front();
        return t;
    }
    if (streams.empty()) return NULL;
    const int PENDING_LIMIT = 1000;
    static u_char buffer[PENDING_LIMIT];
    int count = 0;
    state st = s_init;
    while (true) {
        // keep one byte free for the terminating '\0'
        if (count == (PENDING_LIMIT-1)) {
            token_error("token too long");
            break;
        }
        // st indexes a table column, so it must be a resting state
        if (st >= end_state) {
            token_error("finite state machine error");
            break;
        }
        u_char c;
        if (!next_char(c)) break;
        st = parse_table[c][st];
        switch (st) {
            case s_string:
            case s_token: {
                buffer[count++] = c;
            } break;

            case s_term: {
                // c belongs to the following token
                push_char(c);
                st = s_init;
            } break;

            case s_string1: {
                // opening quote is not part of the string
                st = s_string;
            } break;

            case s_string2: {
                // closing quote (or line end) finishes the string
                st = s_init;
            } break;

            case s_single: {
                buffer[count++] = c;
                st = s_init;
            } break;

            case s_ignore:
            case s_eol: {
            } break;

            case s_slash: {
                buffer[count++] = c;
                if (next_char(c)) {
                    if (c == (u_char)'/') {
                        // start of ignore to eol on //
                        count--;
                        st = s_eol;
                    }
                    else {
                        // not a // token, just return this single /
                        push_char(c);
                        st = s_init;
                    }
                }
                else {
                    // cannot get another char
                    st = s_init;
                }
            } break;

            default: {
                token_error();
                token_error("unknown state %d %s \n", st, " ");
            } break;
        }
        if (st == s_init) break;
    }

    buffer[count] = '\0';
    if (count == 0) return NULL;
    char *t = register_string((char*)buffer);
    if (t == token_include) {
        char *f = next();   // should be file name
        char *s = next();   // should be semicolon
        if (s == token_semi) {
            include(f);
            return next();
        }
        else {
            // not an include statement after all; hand the tokens back
            push(s);
            push(f);
            return t;
        }
    }
    return t;
}


// Read the next token and convert it as a base 10 integer.  Reports an
// error and returns 0 if there is no token or it is not entirely numeric.
int TOKEN::nextint() {
    char *t = next();
    if (!t) {
        // next() returns NULL when no token is available; strtol() must
        // not be handed a null pointer.
        token_error("integer", t);
        return 0;
    }
    char *e;
    long i = strtol(t, &e, 10);
    if (*e != '\0') {
        token_error("integer", t);
        return 0;
    }
    return (int)i;
}


// Discard input up to and including the next '\n', or until the input
// is exhausted.
void TOKEN::skipeol() {
    while (true) {
        u_char c;
        if (!next_char(c)) break;
        if (c == (u_char)'\n') break;
    }
}


// Print the current file/line context followed by the message err.
void TOKEN::token_error(const char *err) {
    token_error();
    printf("%s \n", err);
}


// Low level formatter: fmt must contain one %d and one %s conversion.
void TOKEN::token_error(const char *fmt, int d, const char *s) {
    if (!s) s = "null";     // never hand a null pointer to %s
    printf(fmt, d, s);
}


// Low level formatter: fmt must contain two %s conversions.  A null h is
// printed as "null".
void TOKEN::token_error(const char *fmt, const char *t, const char *h) {
    if (!h) h = "null";
    printf(fmt, t, h);
}


// Print the current file/line context followed by an "expecting want,
// found have" message.
void TOKEN::token_error(const char *want, const char *have) {
    token_error();
    token_error("expecting %s, found %s \n", want, have);
}


// Print the current position, then one line for each enclosing file in
// the include stack.
void TOKEN::token_error() {
    token_error("syntax error at line %d in file %s -- ", cur_line(), cur_fn());
    line_list::iterator   j = linenumbers.begin();
    string_list::iterator i = filenames.begin();
    for (; i!=filenames.end(); ++i,++j) {
        // the front entry is the current file, already reported above
        if (i != filenames.begin()) {
            char *fn = (*i);
            int   li = (*j);
            token_error("\n included from line %d in file %s -- ", li, fn);
        }
    }
}