view src/scanner.h @ 74:b7449114ebb0

start coding on new config syntax
author carl
date Sun, 10 Jul 2005 14:19:00 -0700
parents
children 1142e46be550
line wrap: on
line source

#ifndef scanner_include
#define scanner_include

#include "dnsbl.h"

////////////////////////////////////////////////
// memory for the content scanner
//
// Holds what the content scanner has collected so far: a set of host names
// and two counters (bad html tags, binary tags), together with the lookup
// sets and the milter private data passed to the constructor.
// NOTE(review): this header does not show who owns priv, html_tags and tlds;
// presumably the caller keeps them alive - confirm in the implementation.
class recorder
{
    mlfiPriv    *priv;          // needed for syslog
    string_set  *html_tags;     // valid tags
    string_set  *tlds;          // valid tlds
    string_set  hosts;          // exposed by reference through get_hosts()
    int         bad_html_tags;  // tested against the limit in excessive_bad_tags()
    int         binary_tags;    // weighted by 3 in excessive_bad_tags()

public:
    recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);
    ~recorder();

    // defined out of line; see the implementation file for their behavior
    void empty();
    void new_url(char *host);
    void new_tag(char *tag);
    void binary();

    // plain accessors for the stored pointers and the host set
    mlfiPriv *get_priv()
    {
        return priv;
    }

    string_set *get_tlds()
    {
        return tlds;
    }

    string_set &get_hosts()
    {
        return hosts;
    }

    // True only when limit is positive, the bad tag count exceeds it, and
    // the bad tag count is also more than three times the binary tag count.
    // A limit of zero or less always yields false.
    bool excessive_bad_tags(int limit)
    {
        if (limit <= 0)             return false;
        if (bad_html_tags <= limit) return false;
        return bad_html_tags > 3*binary_tags;
    }

    // True only when limit is positive and more than limit hosts are stored.
    // A limit of zero or less always yields false.
    bool excessive_hosts(int limit)
    {
        if (limit <= 0) return false;
        return hosts.size() > limit;
    }
};


////////////////////////////////////////////////
// finite state machine
//
// States shared by all of the scanner's finite state machines.
// Enumerators carry no explicit values, so their order fixes their numbers:
// everything before end_state is counted by it, everything after is not.
// Do not reorder or insert entries without updating the matching table.
enum state {
    // ---- host name recognizer ----
    h_init,
    h_host,

    // ---- html tag discarder ----
    t_init,
    t_tag1,     // seen opening <
    t_tag2,     // not comment
    t_com1,     // seen !
    t_com2,     // seen first -
    t_com3,     // seen second -, looking for -->
    t_com4,     // seen first -
    t_com5,     // seen second -
    t_disc,     // looking for closing >

    // ---- url recognizer ----
    u_init,
    u_http,
    u_sla,
    u_url,

    // ---- url decoder, %xx ----
    d_init,
    d_pcnt,
    d_1,

    // ---- html entity decoder, &#nnn; ----
    e_init,
    e_amp,
    e_num,

    // ---- mime decoder, =xx ----
    m_init,
    m_eq,
    m_1,

    // ---- base64 decoder ----
    b_init,
    b_lf,
    b_lf2,
    b_64,

    // ---- uuencoding decoder ----
    uu_init,
    uu_lf,
    uu_lf2,
    uu_64,

    // counter for number of columns in the table
    end_state,

    // ---- temporary states (placed after end_state, so not counted) ----
    h_end,
    t_bin,
    t_end,
    u_reco,
    d_2,
    e_semi,
    m_2,
    m_cr,
    m_nl,
    b_cr,
    uu_cr
};

// size, in bytes, of the pending buffer inside each fsa
#define PENDING_LIMIT 100

// One finite state machine instance. It is constructed with a starting
// state, up to two other fsa objects, and the recorder it reports to.
// NOTE(review): the roles of next1/next2 (presumably downstream machines
// that receive this machine's output) and of count (presumably the number
// of bytes currently held in pending) are not visible in this header -
// confirm against the implementation.
class fsa {
    u_char      pending[PENDING_LIMIT];     // fixed size byte buffer
    int         count;                      // see NOTE above
    state       st;                         // a state value; presumably the current one
    state       init;                       // a state value; presumably the starting one
    fsa         *next1;                     // first linked machine, see NOTE above
    fsa         *next2;                     // second linked machine, see NOTE above
    recorder    *memory;                    // recorder supplied to the constructor

public:
    fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);

    // feed len bytes starting at buf into this machine
    void push(u_char *buf, int len);

    // defined out of line; see the implementation file for their behavior
    void pusher();
    void error(char *err);
};


////////////////////////////////////////////////
// the content scanner
//
// Front end of the content scanner: holds eight fsa pointers and offers a
// single scan() entry point that takes a byte buffer and its length.
// NOTE(review): the parser names line up with the state groups in enum state
// (host, tag, url, url decode, html entity, mime, base64, uuencode), so each
// pointer presumably drives that group - confirm in the implementation.
// NOTE(review): the class has a destructor and raw pointers but declares no
// copy constructor or assignment operator; check that instances are never
// copied if the destructor releases these objects.
class url_scanner {
    fsa *host_parser;
    fsa *tags_parser;
    fsa *urls_parser;
    fsa *urld_parser;
    fsa *html_parser;
    fsa *mime_parser;
    fsa *b64_parser;
    fsa *uu_parser;

public:
    // memory is the recorder handed to the scanner at construction
    url_scanner(recorder *memory);
    ~url_scanner();

    // scan length bytes starting at buffer
    void scan(u_char *buffer, size_t length);
};

#endif