comparison src/scanner.cpp @ 73:2b369f7db7bf

start coding on new config syntax
author carl
date Sun, 10 Jul 2005 13:28:33 -0700
parents fb8afa205293
children b7449114ebb0
comparison
equal deleted inserted replaced
72:e6a2d0be7c5e 73:2b369f7db7bf
4 the GPL version 2 or any later version at your choice available at 4 the GPL version 2 or any later version at your choice available at
5 http://www.fsf.org/licenses/gpl.txt 5 http://www.fsf.org/licenses/gpl.txt
6 6
7 */ 7 */
8 8
9 #include "includes.h"
10
9 static char* scanner_version="$Id$"; 11 static char* scanner_version="$Id$";
10
11 using namespace std;
12
13
14 // object to record things we see in the body content
15 struct recorder
16 {
17 mlfiPriv *priv; // needed for syslog
18 string_set *html_tags; // valid tags
19 string_set *tlds; // valid tlds
20 string_set hosts;
21 int bad_html_tags;
22 int binary_tags;
23 recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_);
24 ~recorder();
25 void empty();
26 void new_url(char *host);
27 void new_tag(char *tag);
28 void binary();
29 };
30 recorder::recorder(mlfiPriv *priv_, string_set *html_tags_, string_set *tlds_) {
31 priv = priv_;
32 html_tags = html_tags_;
33 tlds = tlds_;
34 bad_html_tags = 0;
35 binary_tags = 0;
36 }
37 recorder::~recorder() {
38 empty();
39 }
40 void recorder::empty() {
41 bad_html_tags = 0;
42 binary_tags = 0;
43 discard(hosts);
44 }
45 void recorder::new_url(char *host) {
46 register_string(hosts, host);
47 }
48 void recorder::binary() {
49 binary_tags++;
50 }
51 void recorder::new_tag(char *tag) {
52 string_set::iterator i = html_tags->find(tag);
53 if (i == html_tags->end()) {
54 bad_html_tags++;
55 if (debug_syslog && (bad_html_tags < 10)) {
56 // only log the first 10 bad tags
57 char buf[200];
58 snprintf(buf, sizeof(buf), "bad html tag %s", tag);
59 my_syslog(priv, buf);
60 }
61 }
62 }
63
64
65
66 enum state {// host name recognizer states
67 h_init,
68 h_host,
69
70 // html tag discarder states
71 t_init,
72 t_tag1, // seen opening <
73 t_tag2, // not comment
74 t_com1, // seen !
75 t_com2, // seen first -
76 t_com3, // seen second -, looking for -->
77 t_com4, // seen first -
78 t_com5, // seen second -
79 t_disc, // looking for closing >
80
81 // url recognizer states
82 u_init,
83 u_http,
84 u_sla,
85 u_url,
86
87 // url decoder states %xx
88 d_init,
89 d_pcnt,
90 d_1,
91
92 // html entity decoder states &#nnn;
93 e_init,
94 e_amp,
95 e_num,
96
97 // mime decoder states =xx
98 m_init,
99 m_eq,
100 m_1,
101
102 // base64 decoder states
103 b_init,
104 b_lf,
105 b_lf2,
106 b_64,
107
108 // uuencoding decoder states
109 uu_init,
110 uu_lf,
111 uu_lf2,
112 uu_64,
113
114 // counter for number of columns in the table
115 end_state,
116
117 // temporary states
118 h_end,
119 t_bin,
120 t_end,
121 u_reco,
122 d_2,
123 e_semi,
124 m_2,
125 m_cr,
126 m_nl,
127 b_cr,
128 uu_cr
129 };
130 12
131 typedef state PARSE[end_state]; 13 typedef state PARSE[end_state];
132 14
133 static PARSE parse_table[256] = { 15 static PARSE parse_table[256] = {
134 // h_init, h_host, t_init, t_tag1, t_tag2, t_com1, t_com2, t_com3, t_com4, t_com5, t_disc, u_init, u_http, u_sla , u_url, d_init, d_pcnt, d_1, e_init, e_amp, e_num, m_init, m_eq, m_1, b_init, b_lf, b_lf2, b_64 uu_init, uu_lf, uu_lf2, uu_64 16 // h_init, h_host, t_init, t_tag1, t_tag2, t_com1, t_com2, t_com3, t_com4, t_com5, t_disc, u_init, u_http, u_sla , u_url, d_init, d_pcnt, d_1, e_init, e_amp, e_num, m_init, m_eq, m_1, b_init, b_lf, b_lf2, b_64 uu_init, uu_lf, uu_lf2, uu_64
1167 0, // 0xfd 1049 0, // 0xfd
1168 0, // 0xfe 1050 0, // 0xfe
1169 0, // 0xff 1051 0, // 0xff
1170 }; 1052 };
1171 1053
1172 #define PENDING_LIMIT 100 1054
1173 struct fsa { 1055 ////////////////////////////////////////////////
1174 u_char pending[PENDING_LIMIT]; 1056 //
1175 int count; 1057 //
1176 state st; 1058 recorder::recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_) {
1177 state init; 1059 priv = priv_;
1178 fsa *next1; 1060 html_tags = &html_tags_;
1179 fsa *next2; 1061 tlds = &tlds_;
1180 recorder *memory; 1062 bad_html_tags = 0;
1181 1063 binary_tags = 0;
1182 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); 1064 }
1183 void push(u_char *buf, int len); 1065 recorder::~recorder() {
1184 void pusher(); 1066 empty();
1185 void error(char *err); 1067 }
1186 }; 1068 void recorder::empty() {
1187 1069 bad_html_tags = 0;
1070 binary_tags = 0;
1071 discard(hosts);
1072 }
1073 void recorder::new_url(char *host) {
1074 register_string(hosts, host);
1075 }
1076 void recorder::binary() {
1077 binary_tags++;
1078 }
1079 void recorder::new_tag(char *tag) {
1080 string_set::iterator i = html_tags->find(tag);
1081 if (i == html_tags->end()) {
1082 bad_html_tags++;
1083 if (debug_syslog && (bad_html_tags < 10)) {
1084 // only log the first 10 bad tags
1085 char buf[200];
1086 snprintf(buf, sizeof(buf), "bad html tag %s", tag);
1087 my_syslog(priv, buf);
1088 }
1089 }
1090 }
1091
1092
1093 ////////////////////////////////////////////////
1094 //
1095 //
1188 fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) { 1096 fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) {
1189 count = 0; 1097 count = 0;
1190 st = init_; 1098 st = init_;
1191 init = init_; 1099 init = init_;
1192 next1 = next1_; 1100 next1 = next1_;
1445 } break; 1353 } break;
1446 } 1354 }
1447 } 1355 }
1448 } 1356 }
1449 1357
1450 struct url_scanner { 1358
1451 fsa *host_parser; 1359 ////////////////////////////////////////////////
1452 fsa *tags_parser; 1360 //
1453 fsa *urls_parser; 1361 //
1454 fsa *urld_parser;
1455 fsa *html_parser;
1456 fsa *mime_parser;
1457 fsa *b64_parser;
1458 fsa *uu_parser;
1459
1460 url_scanner(recorder *memory);
1461 ~url_scanner();
1462 void scan(u_char *buffer, size_t length);
1463 };
1464
1465 url_scanner::url_scanner(recorder *memory) { 1362 url_scanner::url_scanner(recorder *memory) {
1466 host_parser = new fsa(h_init, NULL, NULL, memory); 1363 host_parser = new fsa(h_init, NULL, NULL, memory);
1467 tags_parser = new fsa(t_init, host_parser, NULL, memory); 1364 tags_parser = new fsa(t_init, host_parser, NULL, memory);
1468 urls_parser = new fsa(u_init, NULL, NULL, memory); 1365 urls_parser = new fsa(u_init, NULL, NULL, memory);
1469 urld_parser = new fsa(d_init, urls_parser, tags_parser, memory); 1366 urld_parser = new fsa(d_init, urls_parser, tags_parser, memory);