comparison src/scanner.h @ 117:aa07452e641b

uribl patch from Jeff Evans <jeffe@tricab.com>
author carl
date Sun, 12 Mar 2006 10:15:39 -0800
parents 81f1e400e8ab
children ecb40aa3eaa5
comparison
equal deleted inserted replaced
116:0094678a16d0 117:aa07452e641b
6 //////////////////////////////////////////////// 6 ////////////////////////////////////////////////
7 // memory for the content scanner 7 // memory for the content scanner
8 // 8 //
9 class recorder 9 class recorder
10 { 10 {
11 mlfiPriv *priv; // needed for syslog 11 mlfiPriv *priv; // needed for syslog
12 string_set *html_tags; // valid tags 12 string_set *html_tags; // valid tags
13 string_set *tlds; // valid tlds 13 string_set *tlds; // valid tlds
14 string_set hosts; 14 string_set *cctlds; // valid cctlds
15 int bad_html_tags; 15 string_set hosts;
16 int binary_tags; 16 int bad_html_tags;
17 int binary_tags;
17 18
18 public: 19 public:
19 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_); 20 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_);
20 ~recorder() { empty(); }; 21 ~recorder() { empty(); };
21 void empty(); 22 void empty();
22 void new_url(char *host); 23 void new_url(char *host);
23 void new_tag(char *tag); 24 void new_tag(char *tag);
24 void binary(); 25 void binary();
25 mlfiPriv *get_priv() { return priv; }; 26 mlfiPriv *get_priv() { return priv; };
26 string_set *get_tlds() { return tlds; }; 27 string_set *get_cctlds() { return cctlds; };
27 string_set &get_hosts() { return hosts; }; 28 string_set *get_tlds() { return tlds; };
28 bool excessive_bad_tags(int limit) { return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); }; 29 string_set &get_hosts() { return hosts; };
29 bool excessive_hosts(int limit) { return (limit > 0) && (hosts.size() > limit); }; 30 bool excessive_bad_tags(int limit) { return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };
31 bool excessive_hosts(int limit) { return (limit > 0) && (hosts.size() > limit); };
30 }; 32 };
31 33
32 34
33 //////////////////////////////////////////////// 35 ////////////////////////////////////////////////
34 // the content scanner 36 // the content scanner
35 // 37 //
36 class fsa; 38 class fsa;
37 class url_scanner { 39 class url_scanner {
38 fsa *host_parser; 40 fsa *host_parser;
39 fsa *tags_parser; 41 fsa *tags_parser;
40 fsa *urls_parser; 42 fsa *urls_parser;
41 fsa *urld_parser; 43 fsa *urld_parser;
42 fsa *html_parser; 44 fsa *html_parser;
43 fsa *mime_parser; 45 fsa *mime_parser;
44 fsa *b64_parser; 46 fsa *b64_parser;
45 fsa *uu_parser; 47 fsa *uu_parser;
46 48
47 public: 49 public:
48 url_scanner(recorder *memory); 50 url_scanner(recorder *memory);
49 ~url_scanner(); 51 ~url_scanner();
50 void scan(u_char *buffer, size_t length); 52 void scan(u_char *buffer, size_t length);
51 }; 53 };
52 54
53 #endif 55 #endif