view src/scanner.h @ 98:91c27c00048f

Tokenizer errors now go through syslog so that they are visible during config file reloads in normal operation.
author carl
date Thu, 22 Sep 2005 21:57:08 -0700
parents 81f1e400e8ab
children aa07452e641b
line wrap: on
line source

#ifndef scanner_include
#define scanner_include

#include "dnsbl.h"

////////////////////////////////////////////////
// memory for the content scanner
//
// Accumulates what the content scanner reports while it processes a
// message: the set of hosts seen, plus counters for bad html tags and
// binary tags. The two threshold predicates below let a caller ask
// whether those totals exceed a configured limit.
//
// NOTE(review): the destructor calls empty(); if empty() releases storage
// referenced by `hosts`, copying a recorder would be unsafe - confirm
// against the implementation before passing one by value.
class recorder
{
    mlfiPriv    *priv;          // needed for syslog
    string_set  *html_tags;     // valid tags
    string_set  *tlds;          // valid tlds
    string_set  hosts;          // hosts collected so far (held by value)
    int         bad_html_tags;  // compared against the limit in excessive_bad_tags()
    int         binary_tags;    // weighs against bad_html_tags in excessive_bad_tags()

public:
    // The tag and tld sets are taken by reference while the members are
    // pointers - presumably their addresses are stored; the definition is
    // not in this header.
    recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);

    // Destruction goes through the same empty() that callers can invoke.
    ~recorder()                                 { empty(); }

    void empty();
    void new_url(char *host);   // presumably notes a host found in a url - defined elsewhere
    void new_tag(char *tag);    // presumably checks a tag against html_tags - defined elsewhere
    void binary();              // presumably bumps binary_tags - defined elsewhere

    // Plain accessors. get_hosts() hands out a mutable reference to the
    // internal set, so it deliberately stays non-const.
    mlfiPriv   *get_priv() const                { return priv;  }
    string_set *get_tlds() const                { return tlds;  }
    string_set &get_hosts()                     { return hosts; }

    // True only when a positive limit is given, the bad tag count is above
    // that limit, and it is also more than three times the binary tag count.
    bool excessive_bad_tags(int limit) const
    {
        return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags);
    }

    // True only when a positive limit is given and more hosts than that
    // have been collected. limit is known to be positive before the cast,
    // so the conversion to an unsigned type cannot wrap.
    bool excessive_hosts(int limit) const
    {
        return (limit > 0) && (hosts.size() > static_cast<size_t>(limit));
    }
};


////////////////////////////////////////////////
// the content scanner
//
// The state machine type is only ever held by pointer in this header, so a
// forward declaration is sufficient here.
class fsa;

// Front end of the content scanner: owns a group of fsa instances and
// exposes a single scan() entry point that is fed raw message bytes.
// The parser names suggest one machine per kind of content (hosts, tags,
// urls, html, mime, base64, uuencode) - presumably; their roles are
// defined in the implementation file, not here.
class url_scanner
{
    // NOTE(review): declaration order is kept as-is; it determines the
    // member initialization order in the constructor.
    fsa *host_parser;
    fsa *tags_parser;
    fsa *urls_parser;
    fsa *urld_parser;
    fsa *html_parser;
    fsa *mime_parser;
    fsa *b64_parser;
    fsa *uu_parser;

public:
    // memory is the recorder handed to the scanner at construction -
    // presumably where the parsers report what they find.
    url_scanner(recorder *memory);
    ~url_scanner();

    // Feed `length` bytes starting at `buffer` to the scanner.
    void scan(u_char *buffer, size_t length);
};

#endif