Mercurial > dnsbl
comparison src/scanner.cpp @ 73:2b369f7db7bf
start coding on new config syntax
author | carl |
---|---|
date | Sun, 10 Jul 2005 13:28:33 -0700 |
parents | fb8afa205293 |
children | b7449114ebb0 |
comparison
equal
deleted
inserted
replaced
72:e6a2d0be7c5e | 73:2b369f7db7bf |
---|---|
4 the GPL version 2 or any later version at your choice available at | 4 the GPL version 2 or any later version at your choice available at |
5 http://www.fsf.org/licenses/gpl.txt | 5 http://www.fsf.org/licenses/gpl.txt |
6 | 6 |
7 */ | 7 */ |
8 | 8 |
9 #include "includes.h" | |
10 | |
9 static char* scanner_version="$Id$"; | 11 static char* scanner_version="$Id$"; |
10 | |
11 using namespace std; | |
12 | |
13 | |
14 // object to record things we see in the body content | |
15 struct recorder | |
16 { | |
17 mlfiPriv *priv; // needed for syslog | |
18 string_set *html_tags; // valid tags | |
19 string_set *tlds; // valid tlds | |
20 string_set hosts; | |
21 int bad_html_tags; | |
22 int binary_tags; | |
23 recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_); | |
24 ~recorder(); | |
25 void empty(); | |
26 void new_url(char *host); | |
27 void new_tag(char *tag); | |
28 void binary(); | |
29 }; | |
30 recorder::recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_) { | |
31 priv = priv_; | |
32 html_tags = html_tags_; | |
33 tlds = tlds_; | |
34 bad_html_tags = 0; | |
35 binary_tags = 0; | |
36 } | |
37 recorder::~recorder() { | |
38 empty(); | |
39 } | |
40 void recorder::empty() { | |
41 bad_html_tags = 0; | |
42 binary_tags = 0; | |
43 discard(hosts); | |
44 } | |
45 void recorder::new_url(char *host) { | |
46 register_string(hosts, host); | |
47 } | |
48 void recorder::binary() { | |
49 binary_tags++; | |
50 } | |
51 void recorder::new_tag(char *tag) { | |
52 string_set::iterator i = html_tags->find(tag); | |
53 if (i == html_tags->end()) { | |
54 bad_html_tags++; | |
55 if (debug_syslog && (bad_html_tags < 10)) { | |
56 // only log the first 10 bad tags | |
57 char buf[200]; | |
58 snprintf(buf, sizeof(buf), "bad html tag %s", tag); | |
59 my_syslog(priv, buf); | |
60 } | |
61 } | |
62 } | |
63 | |
64 | |
65 | |
66 enum state {// host name recognizer states | |
67 h_init, | |
68 h_host, | |
69 | |
70 // html tag discarder states | |
71 t_init, | |
72 t_tag1, // seen opening < | |
73 t_tag2, // not comment | |
74 t_com1, // seen ! | |
75 t_com2, // seen first - | |
76 t_com3, // seen second -, looking for --> | |
77 t_com4, // seen first - | |
78 t_com5, // seen second - | |
79 t_disc, // looking for closing > | |
80 | |
81 // url recognizer states | |
82 u_init, | |
83 u_http, | |
84 u_sla, | |
85 u_url, | |
86 | |
87 // url decoder states %xx | |
88 d_init, | |
89 d_pcnt, | |
90 d_1, | |
91 | |
92 // html entity decoder states &#nnn; | |
93 e_init, | |
94 e_amp, | |
95 e_num, | |
96 | |
97 // mime decoder states =xx | |
98 m_init, | |
99 m_eq, | |
100 m_1, | |
101 | |
102 // base64 decoder states | |
103 b_init, | |
104 b_lf, | |
105 b_lf2, | |
106 b_64, | |
107 | |
108 // uuencoding decoder states | |
109 uu_init, | |
110 uu_lf, | |
111 uu_lf2, | |
112 uu_64, | |
113 | |
114 // counter for number of columns in the table | |
115 end_state, | |
116 | |
117 // temporary states | |
118 h_end, | |
119 t_bin, | |
120 t_end, | |
121 u_reco, | |
122 d_2, | |
123 e_semi, | |
124 m_2, | |
125 m_cr, | |
126 m_nl, | |
127 b_cr, | |
128 uu_cr | |
129 }; | |
130 | 12 |
131 typedef state PARSE[end_state]; | 13 typedef state PARSE[end_state]; |
132 | 14 |
133 static PARSE parse_table[256] = { | 15 static PARSE parse_table[256] = { |
134 // h_init, h_host, t_init, t_tag1, t_tag2, t_com1, t_com2, t_com3, t_com4, t_com5, t_disc, u_init, u_http, u_sla , u_url, d_init, d_pcnt, d_1, e_init, e_amp, e_num, m_init, m_eq, m_1, b_init, b_lf, b_lf2, b_64 uu_init, uu_lf, uu_lf2, uu_64 | 16 // h_init, h_host, t_init, t_tag1, t_tag2, t_com1, t_com2, t_com3, t_com4, t_com5, t_disc, u_init, u_http, u_sla , u_url, d_init, d_pcnt, d_1, e_init, e_amp, e_num, m_init, m_eq, m_1, b_init, b_lf, b_lf2, b_64 uu_init, uu_lf, uu_lf2, uu_64 |
1167 0, // 0xfd | 1049 0, // 0xfd |
1168 0, // 0xfe | 1050 0, // 0xfe |
1169 0, // 0xff | 1051 0, // 0xff |
1170 }; | 1052 }; |
1171 | 1053 |
1172 #define PENDING_LIMIT 100 | 1054 |
1173 struct fsa { | 1055 //////////////////////////////////////////////// |
1174 u_char pending[PENDING_LIMIT]; | 1056 // |
1175 int count; | 1057 // |
1176 state st; | 1058 recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) { |
1177 state init; | 1059 priv = priv_; |
1178 fsa *next1; | 1060 html_tags = &html_tags_; |
1179 fsa *next2; | 1061 tlds = &tlds_; |
1180 recorder *memory; | 1062 bad_html_tags = 0; |
1181 | 1063 binary_tags = 0; |
1182 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); | 1064 } |
1183 void push(u_char *buf, int len); | 1065 recorder::~recorder() { |
1184 void pusher(); | 1066 empty(); |
1185 void error(char *err); | 1067 } |
1186 }; | 1068 void recorder::empty() { |
1187 | 1069 bad_html_tags = 0; |
1070 binary_tags = 0; | |
1071 discard(hosts); | |
1072 } | |
1073 void recorder::new_url(char *host) { | |
1074 register_string(hosts, host); | |
1075 } | |
1076 void recorder::binary() { | |
1077 binary_tags++; | |
1078 } | |
1079 void recorder::new_tag(char *tag) { | |
1080 string_set::iterator i = html_tags->find(tag); | |
1081 if (i == html_tags->end()) { | |
1082 bad_html_tags++; | |
1083 if (debug_syslog && (bad_html_tags < 10)) { | |
1084 // only log the first 10 bad tags | |
1085 char buf[200]; | |
1086 snprintf(buf, sizeof(buf), "bad html tag %s", tag); | |
1087 my_syslog(priv, buf); | |
1088 } | |
1089 } | |
1090 } | |
1091 | |
1092 | |
1093 //////////////////////////////////////////////// | |
1094 // | |
1095 // | |
1188 fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) { | 1096 fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) { |
1189 count = 0; | 1097 count = 0; |
1190 st = init_; | 1098 st = init_; |
1191 init = init_; | 1099 init = init_; |
1192 next1 = next1_; | 1100 next1 = next1_; |
1445 } break; | 1353 } break; |
1446 } | 1354 } |
1447 } | 1355 } |
1448 } | 1356 } |
1449 | 1357 |
1450 struct url_scanner { | 1358 |
1451 fsa *host_parser; | 1359 //////////////////////////////////////////////// |
1452 fsa *tags_parser; | 1360 // |
1453 fsa *urls_parser; | 1361 // |
1454 fsa *urld_parser; | |
1455 fsa *html_parser; | |
1456 fsa *mime_parser; | |
1457 fsa *b64_parser; | |
1458 fsa *uu_parser; | |
1459 | |
1460 url_scanner(recorder *memory); | |
1461 ~url_scanner(); | |
1462 void scan(u_char *buffer, size_t length); | |
1463 }; | |
1464 | |
1465 url_scanner::url_scanner(recorder *memory) { | 1362 url_scanner::url_scanner(recorder *memory) { |
1466 host_parser = new fsa(h_init, NULL, NULL, memory); | 1363 host_parser = new fsa(h_init, NULL, NULL, memory); |
1467 tags_parser = new fsa(t_init, host_parser, NULL, memory); | 1364 tags_parser = new fsa(t_init, host_parser, NULL, memory); |
1468 urls_parser = new fsa(u_init, NULL, NULL, memory); | 1365 urls_parser = new fsa(u_init, NULL, NULL, memory); |
1469 urld_parser = new fsa(d_init, urls_parser, tags_parser, memory); | 1366 urld_parser = new fsa(d_init, urls_parser, tags_parser, memory); |