comparison src/scanner.h @ 74:b7449114ebb0

start coding on new config syntax
author carl
date Sun, 10 Jul 2005 14:19:00 -0700
parents
children 1142e46be550
comparison
equal deleted inserted replaced
73:2b369f7db7bf 74:b7449114ebb0
1 #ifndef scanner_include
2 #define scanner_include
3
4 #include "dnsbl.h"
5
6 ////////////////////////////////////////////////
7 // memory for the content scanner
8 // collects host names (hosts) and two tag counters that the excessive_*() checks below evaluate
9 class recorder
10 {
11 mlfiPriv *priv; // needed for syslog
12 string_set *html_tags; // valid tags
13 string_set *tlds; // valid tlds
14 string_set hosts; // held by value; exposed by reference through get_hosts(), sized in excessive_hosts()
15 int bad_html_tags; // counter tested against the limit in excessive_bad_tags()
16 int binary_tags; // counter that offsets bad_html_tags (times 3) in excessive_bad_tags()
17
18 public:
19 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_); // sets are passed by reference while the members are pointers; presumably their addresses are stored - definition not in this header
20 ~recorder();
21 void empty(); // defined elsewhere; presumably resets the recorded state - TODO confirm
22 void new_url(char *host); // defined elsewhere; presumably records host in hosts - TODO confirm
23 void new_tag(char *tag); // defined elsewhere; presumably validates tag against html_tags and counts bad ones - TODO confirm
24 void binary(); // defined elsewhere; presumably increments binary_tags - TODO confirm
25 mlfiPriv *get_priv() {return priv; }; // accessor: returns the stored priv pointer
26 string_set *get_tlds() {return tlds; }; // accessor: returns the stored tlds pointer
27 string_set &get_hosts() {return hosts; }; // accessor: returns a reference to the member set, not a copy
28 bool excessive_bad_tags(int limit) {return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); }; // true only if limit > 0 and bad_html_tags exceeds both limit and 3*binary_tags; limit <= 0 disables the check
29 bool excessive_hosts(int limit) {return (limit > 0) && (hosts.size() > limit); }; // true only if limit > 0 and hosts.size() exceeds limit; limit <= 0 disables the check. NOTE(review): size() is presumably unsigned; the limit > 0 test runs first, so the mixed comparison sees only positive limits - confirm string_set's size type
30 };
31
32
33 ////////////////////////////////////////////////
34 // finite state machine
35 // state names shared by every recognizer/decoder; the prefix (h_, t_, u_, d_, e_, m_, b_, uu_) groups them by machine
36 enum state {// host name recognizer states
37 h_init,
38 h_host,
39
40 // html tag discarder states
41 t_init,
42 t_tag1, // seen opening <
43 t_tag2, // not comment
44 t_com1, // seen !
45 t_com2, // seen first -
46 t_com3, // seen second -, looking for -->
47 t_com4, // seen first - (presumably of the closing --> - TODO confirm)
48 t_com5, // seen second - (presumably of the closing --> - TODO confirm)
49 t_disc, // looking for closing >
50
51 // url recognizer states
52 u_init,
53 u_http,
54 u_sla,
55 u_url,
56
57 // url decoder states %xx
58 d_init,
59 d_pcnt,
60 d_1,
61
62 // html entity decoder states &#nnn;
63 e_init,
64 e_amp,
65 e_num,
66
67 // mime decoder states =xx
68 m_init,
69 m_eq,
70 m_1,
71
72 // base64 decoder states
73 b_init,
74 b_lf,
75 b_lf2,
76 b_64,
77
78 // uuencoding decoder states
79 uu_init,
80 uu_lf,
81 uu_lf2,
82 uu_64,
83 // enumerator values are implicit, so declaration order fixes every value; end_state is 32 with the current list
84 // counter for number of columns in the table
85 end_state,
86
87 // temporary states (declared after end_state, so presumably not columns of the table - TODO confirm against the implementation)
88 h_end,
89 t_bin,
90 t_end,
91 u_reco,
92 d_2,
93 e_semi,
94 m_2,
95 m_cr,
96 m_nl,
97 b_cr,
98 uu_cr
99 };
100
101 #define PENDING_LIMIT 100 // capacity, in bytes, of fsa::pending
102 class fsa { // one state machine instance: a state pair (st, init), a bounded pending buffer, two linked fsa objects and a recorder
103 u_char pending[PENDING_LIMIT]; // fixed-size byte buffer
104 int count; // presumably the number of bytes currently held in pending - TODO confirm
105 state st; // presumably the current state - TODO confirm
106 state init; // presumably the start state supplied to the constructor - TODO confirm
107 fsa *next1; // linked machine; presumably receives this machine's output - TODO confirm
108 fsa *next2; // second linked machine; see next1
109 recorder *memory; // recorder passed to the constructor; ownership is not visible from this header
110
111 public:
112 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); // note: parameter init has the same name as the member init
113 void push(u_char *buf, int len); // defined elsewhere; takes len bytes at buf, presumably feeding them through the machine - TODO confirm
114 void pusher(); // defined elsewhere; presumably forwards pending to next1/next2 - TODO confirm
115 void error(char *err); // defined elsewhere; presumably reports err through the recorder's priv - TODO confirm
116 };
117
118
119 ////////////////////////////////////////////////
120 // the content scanner
121 // holds one fsa pointer per recognizer/decoder and exposes a single scan() entry point
122 class url_scanner {
123 fsa *host_parser; // names below mirror the state-name prefixes (h_, t_, u_, d_, e_, m_, b_, uu_); presumably one machine per group - TODO confirm
124 fsa *tags_parser;
125 fsa *urls_parser;
126 fsa *urld_parser;
127 fsa *html_parser;
128 fsa *mime_parser;
129 fsa *b64_parser;
130 fsa *uu_parser;
131
132 public:
133 url_scanner(recorder *memory); // defined elsewhere; presumably creates and links the fsa objects around memory - TODO confirm
134 ~url_scanner(); // defined elsewhere; presumably releases the fsa objects - TODO confirm
135 void scan(u_char *buffer, size_t length); // defined elsewhere; takes length bytes at buffer. NOTE(review): length is size_t here but fsa::push takes int - check the conversion in the definition
136 };
137
138 #endif