74
|
1 #ifndef scanner_include
|
|
2 #define scanner_include
|
|
3
|
|
4 #include "dnsbl.h"
|
|
5
|
|
6 ////////////////////////////////////////////////
|
|
7 // memory for the content scanner
|
|
8 //
|
|
9 class recorder
|
|
10 {
|
|
11 mlfiPriv *priv; // needed for syslog
|
|
12 string_set *html_tags; // valid tags
|
|
13 string_set *tlds; // valid tlds
|
|
14 string_set hosts;
|
|
15 int bad_html_tags;
|
|
16 int binary_tags;
|
|
17
|
|
18 public:
|
|
19 recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_);
|
|
20 ~recorder();
|
|
21 void empty();
|
|
22 void new_url(char *host);
|
|
23 void new_tag(char *tag);
|
|
24 void binary();
|
|
25 mlfiPriv *get_priv() {return priv; };
|
|
26 string_set *get_tlds() {return tlds; };
|
|
27 string_set &get_hosts() {return hosts; };
|
|
28 bool excessive_bad_tags(int limit) {return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };
|
|
29 bool excessive_hosts(int limit) {return (limit > 0) && (hosts.size() > limit); };
|
|
30 };
|
|
31
|
|
32
|
|
////////////////////////////////////////////////
// finite state machine
//
// States are grouped by recognizer/decoder, identified by the name prefix.
// The enumerators rely on implicit numbering starting at zero, so the order
// below is significant: everything listed before end_state is counted by it,
// and everything after it is a temporary state.
//
enum state {// host name recognizer states
            h_init,
            h_host,

            // html tag discarder states
            t_init,
            t_tag1,     // seen opening <
            t_tag2,     // not comment
            t_com1,     // seen !
            t_com2,     // seen first -
            t_com3,     // seen second -, looking for -->
            t_com4,     // seen first -
            t_com5,     // seen second -
            t_disc,     // looking for closing >

            // url recognizer states
            u_init,
            u_http,
            u_sla,
            u_url,

            // url decoder states %xx
            d_init,
            d_pcnt,
            d_1,

            // html entity decoder states &#nnn;
            e_init,
            e_amp,
            e_num,

            // mime decoder states =xx
            m_init,
            m_eq,
            m_1,

            // base64 decoder states
            b_init,
            b_lf,
            b_lf2,
            b_64,

            // uuencoding decoder states
            uu_init,
            uu_lf,
            uu_lf2,
            uu_64,

            // counter for number of columns in the table
            // (equals the number of enumerators listed above it)
            end_state,

            // temporary states
            h_end,
            t_bin,
            t_end,
            u_reco,
            d_2,
            e_semi,
            m_2,
            m_cr,
            m_nl,
            b_cr,
            uu_cr
};
|
|
100
|
|
101 #define PENDING_LIMIT 100
|
|
102 class fsa {
|
|
103 u_char pending[PENDING_LIMIT];
|
|
104 int count;
|
|
105 state st;
|
|
106 state init;
|
|
107 fsa *next1;
|
|
108 fsa *next2;
|
|
109 recorder *memory;
|
|
110
|
|
111 public:
|
|
112 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_);
|
|
113 void push(u_char *buf, int len);
|
|
114 void pusher();
|
|
115 void error(char *err);
|
|
116 };
|
|
117
|
|
118
|
|
119 ////////////////////////////////////////////////
|
|
120 // the content scanner
|
|
121 //
|
|
122 class url_scanner {
|
|
123 fsa *host_parser;
|
|
124 fsa *tags_parser;
|
|
125 fsa *urls_parser;
|
|
126 fsa *urld_parser;
|
|
127 fsa *html_parser;
|
|
128 fsa *mime_parser;
|
|
129 fsa *b64_parser;
|
|
130 fsa *uu_parser;
|
|
131
|
|
132 public:
|
|
133 url_scanner(recorder *memory);
|
|
134 ~url_scanner();
|
|
135 void scan(u_char *buffer, size_t length);
|
|
136 };
|
|
137
|
|
138 #endif
|