comparison src/scanner.cpp @ 19:b8f5fa3dd5b8

fix problems in the state transitions causing impossible states
author carl
date Fri, 30 Apr 2004 22:44:56 -0700
parents 041ea016b684
children 06de5ab6a232
comparison
equal deleted inserted replaced
18:041ea016b684 19:b8f5fa3dd5b8
33 b_64, 33 b_64,
34 34
35 // counter for number of columns in the table 35 // counter for number of columns in the table
36 end_state, 36 end_state,
37 37
38 // temporary mime states 38 // temporary states
39 h_end, 39 h_end,
40 t_end, 40 t_end,
41 u_reco, 41 u_reco,
42 e_semi, 42 e_semi,
43 m_2, 43 m_2,
132 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x4E N 132 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x4E N
133 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x4F O 133 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x4F O
134 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x50 P 134 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x50 P
135 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x51 Q 135 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x51 Q
136 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x52 R 136 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x52 R
137 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x53 S 137 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x53 S
138 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x54 T 138 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x54 T
139 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x55 U 139 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x55 U
140 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x56 V 140 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x56 V
141 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x57 W 141 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x57 W
142 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x58 X 142 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x58 X
164 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x6E n 164 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x6E n
165 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x6F o 165 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x6F o
166 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x70 p 166 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x70 p
167 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x71 q 167 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x71 q
168 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x72 r 168 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x72 r
169 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x73 s 169 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x73 s
170 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x74 t 170 {h_host, h_host, t_init, t_disc, u_http, u_http, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x74 t
171 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x75 u 171 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x75 u
172 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x76 v 172 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x76 v
173 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x77 w 173 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x77 w
174 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x78 x 174 {h_host, h_host, t_init, t_disc, u_init, u_init, u_url, u_url, e_init, e_init, e_init, m_init, m_init, m_init, b_init, b_64, b_64, b_64, }, // 0x78 x
847 string_set *hosts; 847 string_set *hosts;
848 848
849 fsa(state init, fsa* next1_, fsa* next2_, string_set *hosts_); 849 fsa(state init, fsa* next1_, fsa* next2_, string_set *hosts_);
850 void push(u_char *buf, int len); 850 void push(u_char *buf, int len);
851 void pusher(); 851 void pusher();
852 void error(char *err);
852 }; 853 };
853 854
854 fsa::fsa(state init_, fsa *next1_, fsa *next2_, string_set *hosts_) { 855 fsa::fsa(state init_, fsa *next1_, fsa *next2_, string_set *hosts_) {
855 count = 0; 856 count = 0;
856 st = init_; 857 st = init_;
858 next1 = next1_; 859 next1 = next1_;
859 next2 = next2_; 860 next2 = next2_;
860 hosts = hosts_; 861 hosts = hosts_;
861 } 862 }
862 863
864 void fsa::error(char *err) {
865 count = 0;
866 st = init;
867 if (err) my_syslog(err);
868 }
869
863 void fsa::pusher() { 870 void fsa::pusher() {
864 if (next1) next1->push(pending, count); 871 if (next1) next1->push(pending, count);
865 if (next2) next2->push(pending, count); 872 if (next2) next2->push(pending, count);
866 count = 0; 873 count = 0;
867 } 874 }
868 875
869 void fsa::push(u_char *buf, int len) { 876 void fsa::push(u_char *buf, int len) {
870 for (int i=0; i<len; i++) { 877 for (int i=0; i<len; i++) {
878 if (count == (PENDING_LIMIT-1)) error(NULL);
879 if (st >= end_state) error("finite state machine impossible state");
871 u_char c = buf[i]; 880 u_char c = buf[i];
872 // guard against buffer overflow
873 if (count == (PENDING_LIMIT-1)) {
874 pusher();
875 st = init;
876 }
877 pending[count++] = c; 881 pending[count++] = c;
878 st = parse_table[c][st]; 882 st = parse_table[c][st];
879 switch (st) { 883 switch (st) {
880 884
881 ////////////////////////////// 885 //////////////////////////////
890 register_string(*hosts, (char*)pending); 894 register_string(*hosts, (char*)pending);
891 break; 895 break;
892 } 896 }
893 } 897 }
894 } 898 }
899 st = h_init;
895 } // fall thru 900 } // fall thru
896 901
897 case h_init: { 902 case h_init: {
898 count = 0; 903 count = 0;
899 } break; 904 } break;
913 } break; 918 } break;
914 919
915 ////////////////////////////// 920 //////////////////////////////
916 // url recognizer 921 // url recognizer
917 case u_sla: { 922 case u_sla: {
918 if ((count < 6) || (7 < count)) { 923 if ((count < 6) || (8 < count)) { // allow http:// or https://
919 count = 0; 924 count = 0;
920 st = u_init; 925 st = u_init;
921 } 926 }
922 } break; 927 } break;
923 928
924 case u_reco: { 929 case u_reco: {
925 if (count > 12) { 930 if (count > 13) { // need some minimal length host name after the protocol
926 pending[count-1] = 0; 931 pending[--count] = '\0'; // null terminate host name by overwriting the terminator
932 char *p = NULL;
927 if (strncasecmp((const char *)pending, "http://", 7) == 0) { 933 if (strncasecmp((const char *)pending, "http://", 7) == 0) {
928 char *p = (char *)pending + 7; 934 p = (char *)pending + 7;
929 if (strchr(p, '.')) register_string(*hosts, p); // require at least one . in a dns name
930 } 935 }
936 if (strncasecmp((const char *)pending, "https://", 8) == 0) {
937 p = (char *)pending + 8;
938 }
939 if (p && strchr(p, '.')) register_string(*hosts, p); // require at least one . in a dns name
931 } 940 }
941 st = u_init;
932 } // fall thru 942 } // fall thru
933 943
934 case u_init: { 944 case u_init: {
935 count = 0; // discard all characters 945 count = 0; // discard all characters
936 } break; 946 } break;