annotate src/scanner.h @ 74:b7449114ebb0

start coding on new config syntax
author carl
date Sun, 10 Jul 2005 14:19:00 -0700
parents
children 1142e46be550
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
74
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
1 #ifndef scanner_include
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
2 #define scanner_include
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
3
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
4 #include "dnsbl.h"
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
5
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
6 ////////////////////////////////////////////////
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
7 // memory for the content scanner
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
8 //
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
9 class recorder
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
10 {
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
11 mlfiPriv *priv; // needed for syslog
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
12 string_set *html_tags; // valid tags
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
13 string_set *tlds; // valid tlds
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
14 string_set hosts;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
15 int bad_html_tags;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
16 int binary_tags;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
17
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
18 public:
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
19 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
20 ~recorder();
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
21 void empty();
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
22 void new_url(char *host);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
23 void new_tag(char *tag);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
24 void binary();
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
25 mlfiPriv *get_priv() {return priv; };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
26 string_set *get_tlds() {return tlds; };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
27 string_set &get_hosts() {return hosts; };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
28 bool excessive_bad_tags(int limit) {return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
29 bool excessive_hosts(int limit) {return (limit > 0) && (hosts.size() > limit); };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
30 };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
31
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
32
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
33 ////////////////////////////////////////////////
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
34 // finite state machine
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
35 //
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
// States for all of the scanner's finite state machines, in one enum.
// Enumerators are numbered implicitly from 0 in declaration order, so the
// order below must not change: end_state (32) equals the number of states
// declared before it and serves as the column count of the table, and the
// enumerators after it are the temporary states.
enum state {
    // host name recognizer states
    h_init,
    h_host,

    // html tag discarder states
    t_init,
    t_tag1,     // seen opening <
    t_tag2,     // not comment
    t_com1,     // seen !
    t_com2,     // seen first -
    t_com3,     // seen second -, looking for -->
    t_com4,     // seen first -
    t_com5,     // seen second -
    t_disc,     // looking for closing >

    // url recognizer states
    u_init,
    u_http,
    u_sla,
    u_url,

    // url decoder states %xx
    d_init,
    d_pcnt,
    d_1,

    // html entity decoder states &#nnn;
    e_init,
    e_amp,
    e_num,

    // mime decoder states =xx
    m_init,
    m_eq,
    m_1,

    // base64 decoder states
    b_init,
    b_lf,
    b_lf2,
    b_64,

    // uuencoding decoder states
    uu_init,
    uu_lf,
    uu_lf2,
    uu_64,

    // counter for number of columns in the table
    end_state,

    // temporary states
    h_end,
    t_bin,
    t_end,
    u_reco,
    d_2,
    e_semi,
    m_2,
    m_cr,
    m_nl,
    b_cr,
    uu_cr
};
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
100
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
101 #define PENDING_LIMIT 100
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
102 class fsa {
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
103 u_char pending[PENDING_LIMIT];
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
104 int count;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
105 state st;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
106 state init;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
107 fsa *next1;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
108 fsa *next2;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
109 recorder *memory;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
110
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
111 public:
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
112 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
113 void push(u_char *buf, int len);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
114 void pusher();
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
115 void error(char *err);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
116 };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
117
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
118
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
119 ////////////////////////////////////////////////
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
120 // the content scanner
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
121 //
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
122 class url_scanner {
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
123 fsa *host_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
124 fsa *tags_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
125 fsa *urls_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
126 fsa *urld_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
127 fsa *html_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
128 fsa *mime_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
129 fsa *b64_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
130 fsa *uu_parser;
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
131
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
132 public:
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
133 url_scanner(recorder *memory);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
134 ~url_scanner();
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
135 void scan(u_char *buffer, size_t length);
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
136 };
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
137
b7449114ebb0 start coding on new config syntax
carl
parents:
diff changeset
138 #endif