comparison src/scanner.cpp @ 147:812c80305f26 stable-5-26

fix 5.23 bug and add fsa debug logging
author carl
date Mon, 04 Dec 2006 21:49:09 -0800
parents ecb40aa3eaa5
children c7fc218686f5
comparison
equal deleted inserted replaced
146:7278c9766e26 147:812c80305f26
79 m_nl, 79 m_nl,
80 b_cr, 80 b_cr,
81 uu_cr 81 uu_cr
82 }; 82 };
83 83
84 static char* state_names[] = {"h_init",
85 "h_host",
86 "t_init",
87 "t_tag1",
88 "t_tag2",
89 "t_com1",
90 "t_com2",
91 "t_com3",
92 "t_com4",
93 "t_com5",
94 "t_disc",
95 "u_init",
96 "u_http",
97 "u_sla",
98 "u_url",
99 "d_init",
100 "d_pcnt",
101 "d_1",
102 "e_init",
103 "e_amp",
104 "e_num",
105 "m_init",
106 "m_eq",
107 "m_1",
108 "b_init",
109 "b_lf",
110 "b_lf2",
111 "b_64",
112 "uu_init",
113 "uu_lf",
114 "uu_lf2",
115 "uu_64",
116 "end_state",
117 "h_end",
118 "t_bin",
119 "t_end",
120 "u_reco",
121 "d_2",
122 "e_semi",
123 "m_2",
124 "m_cr",
125 "m_nl",
126 "b_cr",
127 "uu_cr"};
128
84 #define PENDING_LIMIT 100 129 #define PENDING_LIMIT 100
85 class fsa { 130 class fsa {
131 char *myname;
86 u_char pending[PENDING_LIMIT]; 132 u_char pending[PENDING_LIMIT];
87 int count; 133 int count;
88 state st; 134 state st;
89 state init; 135 state init;
90 fsa *next1; 136 fsa *next1;
91 fsa *next2; 137 fsa *next2;
92 recorder *memory; 138 recorder *memory;
93 139
94 public: 140 public:
95 fsa(state init, fsa *next1_, fsa *next2_, recorder *memory_); 141 fsa(char *myname_, state init, fsa *next1_, fsa *next2_, recorder *memory_);
96 void push(u_char *buf, int len); 142 void push(u_char *buf, int len);
97 void pusher(); 143 void pusher();
98 void validhost(); 144 void validhost();
99 void error(char *err); 145 void error(char *err);
100 }; 146 };
1179 1225
1180 1226
1181 //////////////////////////////////////////////// 1227 ////////////////////////////////////////////////
1182 // 1228 //
1183 // 1229 //
1184 fsa::fsa(state init_, fsa *next1_, fsa *next2_, recorder *memory_) { 1230 fsa::fsa(char *myname_, state init_, fsa *next1_, fsa *next2_, recorder *memory_) {
1231 myname = myname_;
1185 count = 0; 1232 count = 0;
1186 st = init_; 1233 st = init_;
1187 init = init_; 1234 init = init_;
1188 next1 = next1_; 1235 next1 = next1_;
1189 next2 = next2_; 1236 next2 = next2_;
1191 } 1238 }
1192 1239
1193 void fsa::error(char *err) { 1240 void fsa::error(char *err) {
1194 count = 0; 1241 count = 0;
1195 st = init; 1242 st = init;
1196 if (err) my_syslog(memory->get_priv(), err); 1243 if (err) memory->syslog(err);
1197 } 1244 }
1198 1245
1199 void fsa::pusher() { 1246 void fsa::pusher() {
1200 if (next1) next1->push(pending, count); 1247 if (next1) next1->push(pending, count);
1201 if (next2) next2->push(pending, count); 1248 if (next2) next2->push(pending, count);
1212 char *p2 = strrchr((const char *)pending, '.'); 1259 char *p2 = strrchr((const char *)pending, '.');
1213 char *p3 = strstr((const char *)pending, ".."); 1260 char *p3 = strstr((const char *)pending, "..");
1214 if (p1 && (p1 != (char*)pending) & !p3) { 1261 if (p1 && (p1 != (char*)pending) & !p3) {
1215 // have a period, so at least two components, and no empty components 1262 // have a period, so at least two components, and no empty components
1216 in_addr ip; 1263 in_addr ip;
1217 if (inet_aton((const char*)pending, &ip)) 1264 if (inet_aton((const char*)pending, &ip)) {
1218 // have an ip address if at least two periods 1265 // have an ip address if at least two periods
1219 if (p1 != p2) memory->new_url((char*)pending); 1266 if (p1 != p2) memory->new_url((char*)pending);
1267 }
1220 else { 1268 else {
1221 for (int i=0; i<count; i++) pending[i] = tolower(pending[i]); 1269 for (int i=0; i<count; i++) pending[i] = tolower(pending[i]);
1222 // is last component a tld? 1270 // is last component a tld?
1223 string_set::iterator i = memory->get_tlds()->find(p2+1); 1271 string_set::iterator i = memory->get_tlds()->find(p2+1);
1224 if (i != memory->get_tlds()->end()) memory->new_url((char*)pending); 1272 if (i != memory->get_tlds()->end()) memory->new_url((char*)pending);
1226 } 1274 }
1227 } 1275 }
1228 } 1276 }
1229 1277
1230 void fsa::push(u_char *buf, int len) { 1278 void fsa::push(u_char *buf, int len) {
1279 if (debug_syslog > 10) {
1280 char msg[200], mbuf[200];
1281 int n = sizeof(mbuf) - 1;
1282 if (len < n) n = len;
1283 memcpy(mbuf, buf, n);
1284 mbuf[n] = '\0';
1285 snprintf(msg, sizeof(msg), "%s sees %s", myname, mbuf);
1286 msg[sizeof(msg)-1] = '\0';
1287 memory->syslog(msg);
1288 }
1231 for (int i=0; i<len; i++) { 1289 for (int i=0; i<len; i++) {
1232 if (count == (PENDING_LIMIT-1)) error(NULL); 1290 if (count == (PENDING_LIMIT-1)) error(NULL);
1233 if (st >= end_state) error("finite state machine impossible state"); 1291 if (st >= end_state) error("finite state machine impossible state");
1234 u_char c = buf[i]; 1292 u_char c = buf[i];
1235 pending[count++] = c; 1293 pending[count++] = c;
1294 if (debug_syslog > 10) {
1295 char *old1 = state_names[st];
1296 char *new1 = state_names[parse_table[c][st]];
1297 char msg[200];
1298 snprintf(msg, sizeof(msg), "%s at (%d,%c) switches from %s to %s", myname, i, c, old1, new1);
1299 memory->syslog(msg);
1300 }
1236 st = parse_table[c][st]; 1301 st = parse_table[c][st];
1237 switch (st) { 1302 switch (st) {
1238 1303
1239 ////////////////////////////// 1304 //////////////////////////////
1240 // host name recognizer 1305 // host name recognizer
1461 1526
1462 //////////////////////////////////////////////// 1527 ////////////////////////////////////////////////
1463 // 1528 //
1464 // 1529 //
1465 url_scanner::url_scanner(recorder *memory) { 1530 url_scanner::url_scanner(recorder *memory) {
1466 host_parser = new fsa(h_init, NULL, NULL, memory); 1531 host_parser = new fsa("host_parser", h_init, NULL, NULL, memory);
1467 tags_parser = new fsa(t_init, host_parser, NULL, memory); 1532 tags_parser = new fsa("tags_parser", t_init, host_parser, NULL, memory);
1468 urls_parser = new fsa(u_init, NULL, NULL, memory); 1533 urls_parser = new fsa("urls_parser", u_init, NULL, NULL, memory);
1469 urld_parser = new fsa(d_init, urls_parser, tags_parser, memory); 1534 urld_parser = new fsa("urld_parser", d_init, urls_parser, tags_parser, memory);
1470 html_parser = new fsa(e_init, urld_parser, NULL, memory); 1535 html_parser = new fsa("html_parser", e_init, urld_parser, NULL, memory);
1471 mime_parser = new fsa(m_init, html_parser, NULL, memory); 1536 mime_parser = new fsa("mime_parser", m_init, html_parser, NULL, memory);
1472 b64_parser = new fsa(b_init, mime_parser, NULL, memory); 1537 b64_parser = new fsa("b64_parser ", b_init, mime_parser, NULL, memory);
1473 uu_parser = new fsa(uu_init, b64_parser, NULL, memory); 1538 uu_parser = new fsa("uu_parser ", uu_init, b64_parser, NULL, memory);
1474 } 1539 }
1475 1540
1476 url_scanner::~url_scanner() { 1541 url_scanner::~url_scanner() {
1477 delete host_parser; 1542 delete host_parser;
1478 delete tags_parser; 1543 delete tags_parser;