From b68fa10533f3d96ffdee2e5eaa9a75048ef368dd Mon Sep 17 00:00:00 2001
From: Kyle K
Date: Thu, 24 Nov 2011 00:01:01 -0600
Subject: initial commit

---
 Makefile     |  22 +++
 webcrawler.c | 487 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 509 insertions(+)
 create mode 100644 Makefile
 create mode 100644 webcrawler.c

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..814c1e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+PROG = webcrawler
+OBJS = $(PROG).o
+CC = gcc
+DBGFLAGS = -g -O0
+ifdef DEBUG
+    CFLAGS = $(DBGFLAGS) -Wall -std=gnu99
+else
+    CFLAGS = -Wall -std=gnu99 -O2 -march=native -mtune=native
+endif
+LDFLAGS = -lm -lcurl
+CURL_LDFLAGS := $(shell curl-config --libs)
+
+$(PROG): $(OBJS)
+	$(CC) $(OBJS) -o $(PROG) $(LDFLAGS)
+
+$(PROG).o: $(PROG).c
+	$(CC) -c $(CFLAGS) $(PROG).c
+
+.PHONY: clean
+
+clean:
+	rm -f *.o ./$(PROG)
diff --git a/webcrawler.c b/webcrawler.c
new file mode 100644
index 0000000..da35724
--- /dev/null
+++ b/webcrawler.c
@@ -0,0 +1,487 @@
+/* webcrawler.c
+ *
+ * Kamil Kaminski
+ * kkamin8@uic.edu
+ *
+ *
+ * Web Crawler
+ * This scans the root url inputted by the user and looks for all <a> and
+ * <img> tags, then crawls from there, making sure that it stays chrooted in
+ * the root url
+ *
+ * ToDo: save crawled sites to disk
+ *
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <curl/curl.h>
+
+#define URL_SZ 1024
+
+struct stat_node
+{
+    char url[URL_SZ];
+    char from_url[URL_SZ];
+    struct stat_node *next, *prev;
+};
+typedef struct stat_node stat_node_t;
+
+struct crawl_stats
+{
+    unsigned int visited_n, checked_n, broken_n;
+    FILE *summary_fl, *visited_fl, *checked_fl, *broken_fl;
+    unsigned header_size;
+    char http_header[sizeof(char) * 1024 * 64]; /* 64KB */
+    stat_node_t *url_queue;
+    stat_node_t *tail;
+    stat_node_t *visited_list;
+    char url_root[URL_SZ]; /* the domain getting crawled */
+    char domain[URL_SZ];
+};
+typedef struct crawl_stats crawl_stats_t;
+
+/* function prototypes */
+int add_stat_node(stat_node_t **, stat_node_t **, const char *, const char *);
+int del_stat_node(stat_node_t **);
+size_t write_data(char *, size_t, size_t, void *);
+int curl_prepare(CURL **, crawl_stats_t *, int, char **);
+int curl_finish(CURL *, crawl_stats_t *);
+int curl_fetchpage(CURL *, crawl_stats_t *, const char *);
+
+/* globals */
+static char last_redir_url[URL_SZ];
+
+int add_stat_node(stat_node_t **head, stat_node_t **tail, const char *url, const char *from_url)
+{
+    stat_node_t *node = (stat_node_t *) malloc(sizeof(stat_node_t));
+    if (!node)
+    {
+        perror("failed to alloc memory for new node to the url queue");
+        return 1;
+    }
+    else
+    {
+        memset(node, 0, sizeof(stat_node_t)); /* zero only after the malloc check */
+        strncpy(node->url, url, URL_SZ);
+        if (from_url)
+            strncpy(node->from_url, from_url, URL_SZ);
+        if (!*head)
+        {
+            *head = node;
+            if (tail)
+                *tail = node; /* first node is also the tail */
+        }
+        else
+        {
+            (*head)->prev = node;
+            node->next = *head;
+            (*head) = node;
+        }
+    }
+
+    return 0;
+}
+
+int del_stat_node(stat_node_t **tail)
+{
+    stat_node_t *node = *tail;
+    *tail = (*tail)->prev;
+    if (*tail)
+        (*tail)->next = NULL;
+    free(node);
+
+    return 0;
+}
+
+/* returns 0 if url is already on the list, 1 otherwise */
+int exists_stat_node(stat_node_t *list, const char *url)
+{
+    stat_node_t *iter = list;
+    while (iter)
+    {
+        if (strncmp(iter->url, url, URL_SZ) == 0)
+            return 0;
+
+        iter = iter->next;
+    }
+
+    return 1;
+}
+
+size_t write_data(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+    /* return size * nmemb; */
+    return fwrite(ptr, size, nmemb, (FILE *) userdata);
+}
+
+size_t write_header(char *ptr, size_t size, size_t nmemb,
+                    void *userdata)
+{
+    crawl_stats_t *p = (crawl_stats_t *) userdata;
+    size_t sz = size * nmemb;
+    size_t room = sizeof(p->http_header) - p->header_size - 1;
+    size_t ncopy = sz < room ? sz : room; /* never overflow the 64KB header buffer */
+
+    memcpy(&(p->http_header[p->header_size]), ptr, ncopy);
+    p->header_size += ncopy;
+    p->http_header[p->header_size] = 0;
+
+    return sz;
+}
+
+int curl_prepare(CURL **curl_handle, crawl_stats_t *page_stats, int argc, char **argv)
+{
+    if (!(*curl_handle = curl_easy_init()))
+    {
+        perror("failed to create curl handle");
+        return 1;
+    }
+
+    page_stats->summary_fl = fopen(argc == 6 ? argv[2] : "summary", "w");
+    page_stats->visited_fl = fopen(argc == 6 ? argv[3] : "visited", "w");
+    page_stats->checked_fl = fopen(argc == 6 ? argv[4] : "checked", "w");
+    page_stats->broken_fl = fopen(argc == 6 ? argv[5] : "broken", "w");
+
+    if (!page_stats->summary_fl || !page_stats->visited_fl ||
+        !page_stats->checked_fl || !page_stats->broken_fl)
+    {
+        perror("failed to open stat files for writing");
+        return 1;
+    }
+
+    return 0;
+}
+
+int curl_finish(CURL *curl_handle, crawl_stats_t *page_stats)
+{
+    /* always cleanup */
+    curl_easy_cleanup(curl_handle);
+
+    /* write out the summary */
+    fprintf(page_stats->summary_fl,
+            "Visited_pages: %u\n"
+            "Checked_URLs: %u\n"
+            "Broken_URLs: %u\n", page_stats->visited_n, page_stats->checked_n, page_stats->broken_n);
+
+    fclose(page_stats->summary_fl);
+    fclose(page_stats->visited_fl);
+    fclose(page_stats->checked_fl);
+    fclose(page_stats->broken_fl);
+
+    /* dealloc the visited list, walking forward and freeing node by node */
+    stat_node_t *iter = page_stats->visited_list;
+    while (iter)
+    {
+        stat_node_t *next = iter->next;
+        free(iter);
+        iter = next;
+    }
+
+    return 0;
+}
+
+int curl_fetchpage(CURL *curl_handle, crawl_stats_t *page_stats, const char *urlname)
+{
+    CURLcode curlret;
+
+    curl_easy_setopt(curl_handle, CURLOPT_URL, urlname);
+    /* actually not needed, since we set CURLOPT_WRITEHEADER explicitly */
+    //curl_easy_setopt(curl_handle, CURLOPT_HEADER, 1L);
+    /* some servers don't like requests that are made without a user-agent
+     * field, so we provide one */
+    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
+    char err_buff[CURL_ERROR_SIZE];
+    curl_easy_setopt(curl_handle, CURLOPT_ERRORBUFFER, err_buff);
+    /* send all data to this function */
+    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_data);
+    curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, write_header);
+
+    /* the page body goes to a scratch file, the headers into page_stats;
+     * bail out if the scratch file cannot be created, since write_header
+     * expects page_stats (not a FILE *) as its userdata */
+    FILE *data;
+    if (!(data = fopen("curl.data", "w")))
+    {
+        perror("failed to open curl.data for writing");
+        return 1;
+    }
+    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *) data);
+    curl_easy_setopt(curl_handle, CURLOPT_WRITEHEADER, (void *) page_stats);
+
+    if ((curlret = curl_easy_perform(curl_handle)) != CURLE_OK)
+    {
+        fprintf(stderr, "failed to fetch %s, %s\n", urlname, err_buff);
+        page_stats->header_size = 0; /* kill current header */
+        fclose(data);
+
+        return 1;
+    }
+
+    int ret = -1;
+    fprintf(stdout, "fetched %s\n", urlname);
+    int code = 0; /* HTTP status code parsed out of the response header */
+    char code_msg[256] = { 0 };
+    sscanf(page_stats->http_header, "HTTP/1.%*[01] %d %[^\r\n] ", &code, code_msg);
+    fprintf(stdout, "\t%d %s, ", code, code_msg);
+    /* printf("%s", page_stats->http_header); */
+    /* ask for the content-type */
+    char *ct;
+    ret = curl_easy_getinfo(curl_handle, CURLINFO_CONTENT_TYPE, &ct);
+    if ((CURLE_OK == ret) && ct)
+        printf("content-type: %s\n", ct);
+
+    if (code >= 400 && code <= 599) /* 4xx and 5xx responses count as broken */
+    {
+        page_stats->header_size = 0; /* kill current header */
+        fclose(data);
+
+        return 2;
+    }
+    else if (code == 301 || code == 302)
+    {
+        char newurl[URL_SZ] = { 0 };
+        char *location = strstr(page_stats->http_header, "Location: ");
+        if (location) /* guard against a redirect that lacks a Location header */
+            sscanf(location, "Location: %[^\r\n] ", newurl);
+        if (newurl[0] == '/') /* got a relative redir, glue it onto the current url */
+        {
+            char fullurl[URL_SZ] = { 0 };
+            snprintf(fullurl, URL_SZ, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &newurl[1] : &newurl[0]);
+            strncpy(newurl, fullurl, URL_SZ);
+        }
+        printf("\tredirecting to: %s\n", strlen(newurl) > 120 ? "too long to display" : newurl);
+        strncpy(last_redir_url, newurl, URL_SZ);
+        page_stats->header_size = 0; /* kill current header */
+        fclose(data);
+
+        return 3;
+    }
+
+    fclose(data);
+    /* external regexp to parse out all (a | img) links */
+    ret = system("sed \'s/>/>/g\' curl.data | sed -r -n \'s/.*<(a|img)[^>]"
+                 "*(href|src)=[\"\'\']?([^\" >]*)[\"\'\']?.*$/\\3/p\' | awk '!/#/' > urls.txt");
+
+    /* we need to stay chrooted */
+    if (strstr(urlname, page_stats->url_root))
+    {
+        FILE *urls_fl;
+        if (!(urls_fl = fopen("urls.txt", "r")))
+            perror("failed to add urls to the queue, ignoring");
+        else
+        {
+            fseek(urls_fl, 0L, SEEK_END);
+            size_t file_sz = ftell(urls_fl);
+            fseek(urls_fl, 0L, SEEK_SET);
+            /* a non-empty file means we scanned links out of a text/html page */
+            if (file_sz || (ct && strstr(ct, "text/html")))
+            {
+                fprintf(page_stats->visited_fl, "%s\n", urlname);
+                page_stats->visited_n++;
+            }
+
+            char url[URL_SZ] = { 0 };
+            while (fgets(url, URL_SZ, urls_fl))
+            {
+                if (strchr(url, '@')) /* I assume these are e-mails */
+                    continue;
+                url[strlen(url)-1] = '\0'; /* kill newline */
+
+                if (strstr(url, page_stats->url_root) || (strstr(url, ".com") ||
+                    strstr(url, ".org") || strstr(url, ".edu") || strstr(url, ".net") ||
+                    strstr(url, ".info") || strstr(url, ".uk") || strstr(url, ".to")))
+                    ; /* already an absolute url, leave it alone */
+                else if (url[0] == '/' && url[1] != '~')
+                {
+                    char url_absolute[URL_SZ] = { 0 };
+                    sprintf(url_absolute, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &url[1] : url);
+                    strncpy(url, url_absolute, URL_SZ);
+                }
+                else if (strstr(url, "/~"))
+                {
+                    char url_absolute[URL_SZ] = { 0 };
+                    sprintf(url_absolute, "%s%s", page_stats->domain, url);
+                    strncpy(url, url_absolute, URL_SZ);
+                }
+                else if (strstr(url, "../"))
+                {
+                    char url_absolute[URL_SZ] = { 0 };
+                    char url_root_dup[URL_SZ] = { 0 };
+                    strcpy(url_root_dup, page_stats->url_root);
+                    if (url_root_dup[strlen(url_root_dup)-1] == '/')
+                        url_root_dup[strlen(url_root_dup)-1] = '\0';
+                    char *last = strrchr(url_root_dup, '/');
+                    *last = 0;
+
+                    sprintf(url_absolute, "%s/%s", url_root_dup, &url[3]);
+                    strncpy(url, url_absolute, URL_SZ);
+                }
+                else if (strstr(url, "./"))
+                {
+                    char url_absolute[URL_SZ] = { 0 };
+                    sprintf(url_absolute, "%s%s", page_stats->url_root, &url[2]);
+                    strncpy(url, url_absolute, URL_SZ);
+                }
+                else if (strstr(url, "http:") && !strstr(url, "http://"))
+                {
+                    char *colon = strchr(url, ':');
+                    char url_absolute[URL_SZ] = { 0 };
+                    sprintf(url_absolute, "%s%s", page_stats->url_root, &colon[1]);
+                    strncpy(url, url_absolute, URL_SZ);
+                }
+                else /* relative webpage, e.g. page.html */
+                {
+                    if (urlname[strlen(urlname)-1] == '/')
+                    {
+                        char url_absolute[URL_SZ] = { 0 };
+                        sprintf(url_absolute, "%s%s%s", urlname,
+                                urlname[strlen(urlname)-1] == '/' ? "" : "/", url);
+                        strncpy(url, url_absolute, URL_SZ);
+                    }
+                    else
+                    {   /* e.g.
+                         * domain.com/dir1/page1.html and user requests page2.html */
+                        char url_absolute[URL_SZ] = { 0 };
+                        char urlname_dup[URL_SZ] = { 0 };
+                        strncpy(urlname_dup, urlname, URL_SZ);
+                        char *lastslash = strrchr(urlname_dup, '/');
+                        lastslash[1] = '\0';
+                        sprintf(url_absolute, "%s%s", urlname_dup, url);
+                        strncpy(url, url_absolute, URL_SZ);
+                    }
+                }
+
+                if (exists_stat_node(page_stats->url_queue, url) == 1) /* avoid dups */
+                {
+                    add_stat_node(&page_stats->url_queue, &page_stats->tail, url, urlname);
+                    printf("\tqueued for fetching: %s\n", url);
+                }
+                else
+                    printf("\talready queued up: %s\n", url);
+
+                memset(url, 0, sizeof(url)); /* clear the buffer before the next fgets */
+            }
+
+            fclose(urls_fl);
+        }
+    }
+    else
+        fprintf(stdout, "\tnot scanning %s for links\n", urlname);
+
+    page_stats->header_size = 0; /* kill current header */
+    return 0;
+}
+
+int main(int argc, char **argv)
+{
+    if (argc != 2 && argc != 6)
+    {
+        fprintf(stderr, "usage: %s <url>\n", argv[0]);
+        fprintf(stderr, "usage: %s <url> <summary> <visited> <checked> <broken>\n", argv[0]);
+        return 1;
+    }
+
+    CURL *curl_handle;
+    crawl_stats_t page_stats;
+    memset(&page_stats, 0, sizeof(page_stats));
+    if (curl_prepare(&curl_handle, &page_stats, argc, argv) != 0)
+        return 1;
+
+    /* call our curl routine to fetch a webpage */
+    int ret = -1;
+    int wasredir = 0;
+
+    /* make sure the url ends with a '/', leaving room for the extra character */
+    char *url_arg = calloc(1, URL_SZ);
+    strncpy(url_arg, argv[1], URL_SZ - 2);
+    size_t url_arg_sz = strlen(url_arg);
+    if (url_arg[url_arg_sz - 1] != '/')
+    {
+        url_arg[url_arg_sz] = '/';
+        url_arg[url_arg_sz + 1] = '\0';
+    }
+
+    /* figure out initial domain name and root url */
+    char initial_domain[URL_SZ] = { 0 };
+    char domain[URL_SZ] = { 0 };
+    char protocol[32] = { 0 };
+    /* sscanf(url_arg, "%*[(http:|ftp:)]//%[^/]", domain); */
+    sscanf(url_arg, "%31[^:]://%[^/]", protocol, domain);
+    snprintf(initial_domain, URL_SZ, "%s://%s", protocol, domain);
+    printf("initial domain is: %s\n", initial_domain);
+    strncpy(page_stats.domain, initial_domain, URL_SZ);
+    strncpy(page_stats.url_root, url_arg, URL_SZ);
+
+    do
+    {   /* loop through all redirs until we reach the root url */
+        ret = curl_fetchpage(curl_handle, &page_stats, wasredir ? last_redir_url : url_arg); /* kickstart */
+        if (ret == 3)
+        {
+            wasredir = 1;
+            strncpy(page_stats.url_root, last_redir_url, URL_SZ); /* deal with redir of root url */
+        }
+    } while (ret == 3);
+
+    if (ret == 1 || ret == 2)
+    {
+        fprintf(page_stats.broken_fl, "%s\n", wasredir ? last_redir_url : url_arg);
+        fprintf(page_stats.checked_fl, "%s\n", wasredir ? last_redir_url : url_arg);
+        page_stats.checked_n++;
+        page_stats.broken_n++;
+    }
+    else if (ret == 0)
+    {
+        add_stat_node(&(page_stats.visited_list), NULL, wasredir ? last_redir_url : url_arg, NULL); /* mark url as visited */
+        fprintf(page_stats.checked_fl, "%s\n", wasredir ?
+                last_redir_url : url_arg);
+        page_stats.checked_n++;
+
+        while (page_stats.tail)
+        {
+            char url[URL_SZ] = { 0 };
+            char from_url[URL_SZ] = { 0 };
+            strncpy(url, page_stats.tail->url, URL_SZ);
+            strncpy(from_url, page_stats.tail->from_url, URL_SZ);
+            del_stat_node(&(page_stats.tail)); /* dequeue */
+            if (exists_stat_node(page_stats.visited_list, url) == 0)
+            {
+                fprintf(stderr, "already visited: %s\n", url);
+                continue;
+            }
+
+            ret = curl_fetchpage(curl_handle, &page_stats, url);
+            add_stat_node(&(page_stats.visited_list), NULL, url, NULL); /* mark url as visited */
+
+            fprintf(page_stats.checked_fl, "%s\n", url);
+            page_stats.checked_n++;
+
+            switch (ret)
+            {
+                case 2: /* broken */
+                    fprintf(page_stats.broken_fl, "%s from %s\n", url, from_url);
+                    page_stats.broken_n++;
+                    break;
+                case 3: /* redir */
+                    if (strlen(last_redir_url) == URL_SZ - 1)
+                        fprintf(stderr, "\tignoring very long redir, stupid host\n");
+                    else
+                        add_stat_node(&(page_stats.url_queue), &(page_stats.tail), last_redir_url, NULL);
+                    break;
+                case 4:
+                    fprintf(stderr, "what the heck just happened!\n");
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    /* cleanup */
+    curl_finish(curl_handle, &page_stats);
+    if (url_arg)
+        free(url_arg);
+
+    return 0;
+}
--
cgit v1.2.3
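Note on the libcurl pattern the patch relies on: curl_fetchpage() uses the easy interface, registering a body callback (CURLOPT_WRITEFUNCTION / CURLOPT_WRITEDATA) and a header callback (CURLOPT_HEADERFUNCTION / CURLOPT_WRITEHEADER) before curl_easy_perform() drives the transfer. The stand-alone sketch below shows only that pattern; the file name fetch_sketch.c, the example.com URL, and the stdout/stderr sinks are illustrative assumptions, not part of the patch.

/* fetch_sketch.c - minimal sketch of the libcurl easy-interface callback
 * pattern used by webcrawler.c (illustrative only)
 * build: gcc fetch_sketch.c -o fetch_sketch $(curl-config --libs)
 */
#include <stdio.h>
#include <curl/curl.h>

/* body bytes arrive here; userdata is whatever CURLOPT_WRITEDATA was set to */
static size_t on_body(char *ptr, size_t size, size_t nmemb, void *userdata)
{
    return fwrite(ptr, size, nmemb, (FILE *) userdata);
}

/* each response header line arrives here once */
static size_t on_header(char *ptr, size_t size, size_t nmemb, void *userdata)
{
    fprintf((FILE *) userdata, "header: %.*s", (int) (size * nmemb), ptr);
    return size * nmemb;
}

int main(void)
{
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL *h = curl_easy_init();
    if (!h)
        return 1;

    curl_easy_setopt(h, CURLOPT_URL, "http://example.com/"); /* placeholder url */
    curl_easy_setopt(h, CURLOPT_USERAGENT, "libcurl-agent/1.0");
    curl_easy_setopt(h, CURLOPT_WRITEFUNCTION, on_body);
    curl_easy_setopt(h, CURLOPT_WRITEDATA, (void *) stdout);   /* body -> stdout */
    curl_easy_setopt(h, CURLOPT_HEADERFUNCTION, on_header);
    curl_easy_setopt(h, CURLOPT_WRITEHEADER, (void *) stderr); /* headers -> stderr */

    CURLcode rc = curl_easy_perform(h);
    if (rc != CURLE_OK)
        fprintf(stderr, "fetch failed: %s\n", curl_easy_strerror(rc));

    long code = 0;
    curl_easy_getinfo(h, CURLINFO_RESPONSE_CODE, &code); /* e.g. 200, 301, 404 */
    fprintf(stderr, "status: %ld\n", code);

    curl_easy_cleanup(h);
    curl_global_cleanup();
    return (rc == CURLE_OK) ? 0 : 1;
}

Unlike webcrawler.c, the sketch reads the status code with CURLINFO_RESPONSE_CODE instead of sscanf'ing the saved header text; both approaches work, the getinfo call simply avoids parsing the raw header.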