view src/scanner.h @ 198:fbcf2733fe79

always rebuild from clean source control tip, rather than from working directory
author Carl Byington <carl@five-ten-sg.com>
date Sat, 02 Feb 2008 11:58:46 -0800
parents c7fc218686f5
children 82886d4dd71f
line wrap: on
line source

/*

Copyright (c) 2007 Carl Byington - 510 Software Group, released under
the GPL version 3 or any later version at your choice available at
http://www.gnu.org/licenses/gpl-3.0.txt

*/

#ifndef scanner_include
#define scanner_include

#include "dnsbl.h"

////////////////////////////////////////////////
// memory for the content scanner
//
class recorder
{
	mlfiPriv	*priv;		// needed for syslog
	string_set	*html_tags; // valid tags
	string_set	*tlds;		// valid tlds
	string_set	*cctlds;	// valid cctlds
	string_set	hosts;
	int 		bad_html_tags;
	int 		binary_tags;

public:
	recorder(mlfiPriv *priv_, string_set &html_tags_, string_set &tlds_, string_set &cctlds_);
	~recorder() 								{ empty(); };
	void empty();
	void new_url(char *host);
	void new_tag(char *tag);
	void binary();
	void syslog(char *buf)						{ my_syslog(priv, buf); 															};
	mlfiPriv   *get_priv()						{ return priv;																		};
	string_set *get_cctlds()					{ return cctlds;																	};
	string_set *get_tlds()						{ return tlds;																		};
	string_set &get_hosts() 					{ return hosts; 																	};
	bool		excessive_bad_tags(int limit)	{ return (limit > 0) && (bad_html_tags > limit) && (bad_html_tags > 3*binary_tags); };
	bool		excessive_hosts(int limit)		{ return (limit > 0) && (hosts.size() > limit); 									};

};


////////////////////////////////////////////////
// the content scanner
//
// fsa is only used through pointers in this header, so a forward
// declaration is sufficient; its definition lives elsewhere.
class fsa;

// Scanner object holding one fsa pointer per parser listed below.
// NOTE(review): the parsers are presumably created in the constructor and
// released in the destructor - confirm against the out-of-line definitions.
class url_scanner {
	// One fsa per parsing role; the roles are inferred from the member
	// names only (host, tags, urls, urld, html, mime, base64, uuencode)
	// - TODO confirm against the implementation file.
	fsa *host_parser;
	fsa *tags_parser;
	fsa *urls_parser;
	fsa *urld_parser;
	fsa *html_parser;
	fsa *mime_parser;
	fsa *b64_parser;
	fsa *uu_parser;

public:
	// memory: the recorder this scanner is constructed with.
	url_scanner(recorder *memory);
	~url_scanner();

	// Scan a buffer; buffer and length are presumably a pointer to the
	// bytes and their count - verify against the definition.
	void scan(u_char *buffer, size_t length);
};

#endif