annotate src/scanner.h @ 143:ecb40aa3eaa5 stable-5-23

require two periods for ip addresses
author carl
date Tue, 10 Oct 2006 19:12:16 -0700
parents aa07452e641b
children 812c80305f26
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
143
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
1 /*
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
2
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
3 Copyright (c) 2006 Carl Byington - 510 Software Group, released under
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
4 the GPL version 2 or any later version at your choice available at
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
5 http://www.fsf.org/licenses/gpl.txt
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
6
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
7 */
ecb40aa3eaa5 require two periods for ip addresses
carl
parents: 117
diff changeset
8
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
9 #ifndef scanner_include
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
10 #define scanner_include
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
11
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
12 #include "dnsbl.h"
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
13
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
14 ////////////////////////////////////////////////
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
15 // memory for the content scanner
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
16 //
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
17 class recorder // holds the host set and tag counters that the excessive_*() checks below consult
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
18 {
117
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
19 mlfiPriv *priv; // needed for syslog
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
20 string_set *html_tags; // valid tags
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
21 string_set *tlds; // valid tlds
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
22 string_set *cctlds; // valid cctlds
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
23 string_set hosts; // held by value; exposed through get_hosts() and sized by excessive_hosts()
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
24 int bad_html_tags; // counter tested against the limit and against 3*binary_tags in excessive_bad_tags()
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
25 int binary_tags; // counter that raises the bad-tag threshold in excessive_bad_tags()
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
26
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
27 public:
117
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
28 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_); // sets are taken by reference while the members are pointers; presumably their addresses are stored - confirm in the definition
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
29 ~recorder() { empty(); }; // destructor only calls empty()
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
30 void empty(); // defined out of line; also invoked by the destructor
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
31 void new_url(char *host); // defined out of line; presumably records host in hosts - confirm
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
32 void new_tag(char *tag); // defined out of line; presumably checks tag against html_tags - confirm
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
33 void binary(); // defined out of line; presumably bumps binary_tags - confirm
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
34 mlfiPriv *get_priv() { return priv; };
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
35 string_set *get_cctlds() { return cctlds; };
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
36 string_set *get_tlds() { return tlds; };
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
37 string_set &get_hosts() { return hosts; }; // non-const reference to the member set, so callers can modify it
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
38 bool excessive_bad_tags(int limit) { return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); }; // false when limit <= 0; otherwise true only if bad_html_tags exceeds both limit and 3*binary_tags
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
39 bool excessive_hosts(int limit) { return (limit > 0) && (hosts.size() > limit); }; // false when limit <= 0; NOTE(review): size() is compared with a signed int - presumably unsigned (string_set not shown here), safe only because limit > 0 is tested first
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
40 };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
41
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
42
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
43 ////////////////////////////////////////////////
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
44 // the content scanner
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
45 //
75
1142e46be550 start coding on new config syntax
carl
parents: 74
diff changeset
46 class fsa; // forward declaration; this header only uses pointers to fsa
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
47 class url_scanner { // holds eight fsa parser pointers; scan() is the only public operation besides construction and destruction
117
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
48 fsa *host_parser;
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
49 fsa *tags_parser;
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
50 fsa *urls_parser; // NOTE(review): the urls/urld distinction is not evident from this header - see the definitions
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
51 fsa *urld_parser;
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
52 fsa *html_parser;
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
53 fsa *mime_parser;
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
54 fsa *b64_parser; // presumably base64 content - confirm in the implementation
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
55 fsa *uu_parser; // presumably uuencoded content - confirm in the implementation
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
56
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
57 public:
117
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
58 url_scanner(recorder *memory); // recorder is passed by pointer; ownership and lifetime are not evident from this header
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
59 ~url_scanner(); // defined out of line
aa07452e641b uribl patch from Jeff Evans <jeffe@tricab.com>
carl
parents: 76
diff changeset
60 void scan(u_char *buffer, size_t length); // defined out of line; presumably processes length bytes starting at buffer - confirm
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
61 };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
62
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
63 #endif