# HG changeset patch # User carl # Date 1121027313 25200 # Node ID 2b369f7db7bf638d62dfb910590f1b5dfc514aca # Parent e6a2d0be7c5e0c34c38b94a4e98b33001c4e674d start coding on new config syntax diff -r e6a2d0be7c5e -r 2b369f7db7bf new.bash --- a/new.bash Sun Jul 10 13:28:33 2005 -0700 +++ b/new.bash Sun Jul 10 13:28:33 2005 -0700 @@ -3,13 +3,13 @@ ############################ ## compile and run the new parser program ## -rm -f new.o context.o tokenizer.o -g++ -c new.cpp context.cpp tokenizer.cpp +rm -f dnsbl.o scanner.o context.o tokenizer.o +g++ -c dnsbl.cpp scanner.cpp context.cpp tokenizer.cpp if [ $? -ne 0 ]; then echo "compiler errors" exit fi -g++ -o new new.o context.o tokenizer.o -pthread +g++ -o dnsbl dnsbl.o scanner.o context.o tokenizer.o -pthread if [ $? -ne 0 ]; then echo "linker errors" exit diff -r e6a2d0be7c5e -r 2b369f7db7bf src/context.cpp --- a/src/context.cpp Sun Jul 10 13:28:33 2005 -0700 +++ b/src/context.cpp Sun Jul 10 13:28:33 2005 -0700 @@ -8,7 +8,7 @@ #include "includes.h" -static char* context_version="$Id:"; +static char* context_version="$Id$"; char *token_black; char *token_content; diff -r e6a2d0be7c5e -r 2b369f7db7bf src/context.h --- a/src/context.h Sun Jul 10 13:28:33 2005 -0700 +++ b/src/context.h Sun Jul 10 13:28:33 2005 -0700 @@ -1,3 +1,6 @@ +#ifndef context_include +#define context_include + #include "tokenizer.h" #include @@ -11,6 +14,7 @@ class DNSBL; class CONTEXT; +class recorder; typedef map string_map; typedef set int_set; @@ -86,6 +90,19 @@ void add_dnsbl(DNSBLP dns) {dnsbl_list.push_back(dns);}; DNSBLP find_dnsbl(char *name); + int get_host_limit() {return host_limit;}; + bool get_host_random() {return host_random;}; + char* get_content_suffix() {return content_suffix;}; + char* get_content_message() {return content_message;}; + string_set& get_content_host_ignore() {return content_host_ignore;}; + string_set& get_content_tlds() {return content_tlds;}; + string_set& get_html_tags() {return html_tags;}; + dnsblp_list& get_dnsbl_list() {return dnsbl_list;}; + bool get_content_filtering() {return content_filtering;}; + + bool acceptable_content(recorder &memory); + bool ignore_host(char *host); + void dump(int level = 0); }; @@ -100,12 +117,20 @@ context_list contexts; // owns all the contexts, not just top level contexts context_map env_to; // map recipient to a filtering context CONTEXTP default_context;// for env_to values that don't have their own specific filtering context + // the default context is also used for some of the content filtering values CONFIG(); ~CONFIG(); void add_context(CONTEXTP con); void add_to(char *to, CONTEXTP con) {env_to[to] = con;}; CONTEXTP find_context(char *to, char *from); + + char* get_content_suffix() {return default_context->get_content_suffix() ;}; + char* get_content_message() {return default_context->get_content_message() ;}; + string_set& get_content_host_ignore() {return default_context->get_content_host_ignore() ;}; + string_set& get_content_tlds() {return default_context->get_content_tlds() ;}; + string_set& get_html_tags() {return default_context->get_html_tags() ;}; + void dump(); }; @@ -147,3 +172,5 @@ CONFIG *parse_config(char *fn); bool load_conf(CONFIG &dc, char *fn); void token_init(); + +#endif diff -r e6a2d0be7c5e -r 2b369f7db7bf src/dnsbl.cpp --- a/src/dnsbl.cpp Sun Jul 10 13:28:33 2005 -0700 +++ b/src/dnsbl.cpp Sun Jul 10 13:28:33 2005 -0700 @@ -74,7 +74,7 @@ #include "includes.h" -static char* dnsbl_version="$Id:"; +static char* dnsbl_version="$Id$"; extern "C" { diff -r e6a2d0be7c5e -r 2b369f7db7bf src/scanner.cpp --- a/src/scanner.cpp Sun Jul 10 13:28:33 2005 -0700 +++ b/src/scanner.cpp Sun Jul 10 13:28:33 2005 -0700 @@ -6,127 +6,9 @@ */ -static char* scanner_version="$Id$"; - -using namespace std; - - -// object to record things we see in the body content -struct recorder -{ - mlfiPriv *priv; // needed for syslog - string_set *html_tags; // valid tags - string_set *tlds; // valid tlds - string_set hosts; - int bad_html_tags; - int binary_tags; - recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_); - ~recorder(); - void empty(); - void new_url(char *host); - void new_tag(char *tag); - void binary(); -}; -recorder::recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_) { - priv = priv_; - html_tags = html_tags_; - tlds = tlds_; - bad_html_tags = 0; - binary_tags = 0; -} -recorder::~recorder() { - empty(); -} -void recorder::empty() { - bad_html_tags = 0; - binary_tags = 0; - discard(hosts); -} -void recorder::new_url(char *host) { - register_string(hosts, host); -} -void recorder::binary() { - binary_tags++; -} -void recorder::new_tag(char *tag) { - string_set::iterator i = html_tags->find(tag); - if (i == html_tags->end()) { - bad_html_tags++; - if (debug_syslog && (bad_html_tags < 10)) { - // only log the first 10 bad tags - char buf[200]; - snprintf(buf, sizeof(buf), "bad html tag %s", tag); - my_syslog(priv, buf); - } - } -} - - - -enum state {// host name recognizer states - h_init, - h_host, +#include "includes.h" - // html tag discarder states - t_init, - t_tag1, // seen opening < - t_tag2, // not comment - t_com1, // seen ! - t_com2, // seen first - - t_com3, // seen second -, looking for --> - t_com4, // seen first - - t_com5, // seen second - - t_disc, // looking for closing > - - // url recognizer states - u_init, - u_http, - u_sla, - u_url, - - // url decoder states %xx - d_init, - d_pcnt, - d_1, - - // html entity decoder states &#nnn; - e_init, - e_amp, - e_num, - - // mime decoder states =xx - m_init, - m_eq, - m_1, - - // base64 decoder states - b_init, - b_lf, - b_lf2, - b_64, - - // uuencoding decoder states - uu_init, - uu_lf, - uu_lf2, - uu_64, - - // counter for number of columns in the table - end_state, - - // temporary states - h_end, - t_bin, - t_end, - u_reco, - d_2, - e_semi, - m_2, - m_cr, - m_nl, - b_cr, - uu_cr - }; +static char* scanner_version="$Id$"; typedef state PARSE[end_state]; @@ -1169,22 +1051,48 @@ 0, // 0xff }; -#define PENDING_LIMIT 100 -struct fsa { - u_char pending[PENDING_LIMIT]; - int count; - state st; - state init; - fsa *next1; - fsa *next2; - recorder *memory; - fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); - void push(u_char *buf, int len); - void pusher(); - void error(char *err); -}; +//////////////////////////////////////////////// +// +// +recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) { + priv = priv_; + html_tags = &html_tags_; + tlds = &tlds_; + bad_html_tags = 0; + binary_tags = 0; +} +recorder::~recorder() { + empty(); +} +void recorder::empty() { + bad_html_tags = 0; + binary_tags = 0; + discard(hosts); +} +void recorder::new_url(char *host) { + register_string(hosts, host); +} +void recorder::binary() { + binary_tags++; +} +void recorder::new_tag(char *tag) { + string_set::iterator i = html_tags->find(tag); + if (i == html_tags->end()) { + bad_html_tags++; + if (debug_syslog && (bad_html_tags < 10)) { + // only log the first 10 bad tags + char buf[200]; + snprintf(buf, sizeof(buf), "bad html tag %s", tag); + my_syslog(priv, buf); + } + } +} + +//////////////////////////////////////////////// +// +// fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) { count = 0; st = init_; @@ -1447,21 +1355,10 @@ } } -struct url_scanner { - fsa *host_parser; - fsa *tags_parser; - fsa *urls_parser; - fsa *urld_parser; - fsa *html_parser; - fsa *mime_parser; - fsa *b64_parser; - fsa *uu_parser; - url_scanner(recorder *memory); - ~url_scanner(); - void scan(u_char *buffer, size_t length); -}; - +//////////////////////////////////////////////// +// +// url_scanner::url_scanner(recorder *memory) { host_parser = new fsa(h_init, NULL, NULL, memory); tags_parser = new fsa(t_init, host_parser, NULL, memory); diff -r e6a2d0be7c5e -r 2b369f7db7bf src/tokenizer.cpp --- a/src/tokenizer.cpp Sun Jul 10 13:28:33 2005 -0700 +++ b/src/tokenizer.cpp Sun Jul 10 13:28:33 2005 -0700 @@ -8,7 +8,7 @@ #include "context.h" -static char* tokenizer_version="$Id:"; +static char* tokenizer_version="$Id$"; enum state {s_init, s_token, diff -r e6a2d0be7c5e -r 2b369f7db7bf src/tokenizer.h --- a/src/tokenizer.h Sun Jul 10 13:28:33 2005 -0700 +++ b/src/tokenizer.h Sun Jul 10 13:28:33 2005 -0700 @@ -1,3 +1,6 @@ +#ifndef tokenizer_include +#define tokenizer_include + #include #include #include @@ -47,3 +50,4 @@ void token_error(); }; +#endif