-rw-r--r--  Makefile        22
-rw-r--r--  webcrawler.c   487
2 files changed, 509 insertions, 0 deletions
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..814c1e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+PROG = webcrawler
+OBJS = $(PROG).o
+CC = gcc
+DBGFLAGS = -g -O0
+ifdef DEBUG
+ CFLAGS = $(DBGFLAGS) -Wall -std=gnu99
+else
+ CFLAGS = -Wall -std=gnu99 -O2 -march=native -mtune=native
+endif
+CURL_LDFLAGS := $(shell curl-config --libs)
+LDFLAGS = -lm $(CURL_LDFLAGS)
+
+$(PROG): $(OBJS)
+ $(CC) $(OBJS) -o $(PROG) $(LDFLAGS)
+
+$(PROG).o: $(PROG).c
+ $(CC) -c $(CFLAGS) $(PROG).c
+
+.PHONY: clean
+
+clean:
+ rm -f *.o ./$(PROG)
diff --git a/webcrawler.c b/webcrawler.c
new file mode 100644
index 0000000..da35724
--- /dev/null
+++ b/webcrawler.c
@@ -0,0 +1,487 @@
+/* webcrawler.c
+ *
+ * Kamil Kaminski
+ * kkamin8@uic.edu
+ *
+ *
+ * Web Crawler
+ * Scans the root url given by the user, looks for all <a href=""> and
+ * <img src="" /> tags, and crawls from there, making sure it stays rooted
+ * in the root url
+ *
+ * ToDo: save crawled sites to disk
+ *
+ *
+ */
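+
+/* Example (a minimal sketch; http://example.com/ is just a placeholder):
+ *
+ *   make
+ *   ./webcrawler http://example.com/
+ *   ./webcrawler http://example.com/ summary visited checked broken
+ *
+ * With only the url argument, the stat files default to "summary",
+ * "visited", "checked" and "broken" in the current directory.
+ */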
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <curl/curl.h>
+
+#define URL_SZ 1024
+
+struct stat_node
+{
+ char url[URL_SZ];
+ char from_url[URL_SZ];
+ struct stat_node *next, *prev;
+};
+typedef struct stat_node stat_node_t;
+
+struct crawl_stats
+{
+ unsigned int visited_n, checked_n, broken_n;
+ FILE *summary_fl, *visited_fl, *checked_fl, *broken_fl;
+ unsigned header_size;
+ char http_header[64 * 1024]; /* 64 KiB */
+ stat_node_t *url_queue;
+ stat_node_t *tail;
+ stat_node_t *visited_list;
+ char url_root[URL_SZ]; /* the domain getting crawled */
+ char domain[URL_SZ];
+};
+typedef struct crawl_stats crawl_stats_t;
+
+/* function prototypes */
+int add_stat_node(stat_node_t **, stat_node_t **, const char *, const char *);
+int del_stat_node(stat_node_t **);
+size_t write_data(char *, size_t, size_t, void *);
+int curl_prepare(CURL **, crawl_stats_t *, int, char **);
+int curl_finish(CURL *, crawl_stats_t *);
+int curl_fetchpage(CURL *, crawl_stats_t *, const char *);
+
+/* globals */
+static char last_redir_url[URL_SZ];
+
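+/* push a new node (url plus the page it was found on) onto the head of the
+ * doubly linked list; *tail is updated only when the list was empty */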
+int add_stat_node(stat_node_t **head, stat_node_t **tail, const char *url, const char *from_url)
+{
+ stat_node_t *node = (stat_node_t *) malloc(sizeof(stat_node_t));
+ if (!node) /* check the allocation before touching it */
+ {
+ perror("failed to alloc memory for new node to the url queue");
+ return 1;
+ }
+ else
+ {
+ memset(node, 0, sizeof(stat_node_t));
+ strncpy(node->url, url, URL_SZ - 1); /* node is zeroed, so the copy stays terminated */
+ if (from_url)
+ strncpy(node->from_url, from_url, URL_SZ - 1);
+ if (!*head)
+ {
+ *head = node;
+ if (tail)
+ *tail = node; /* first node is also the tail */
+ }
+ else
+ {
+ (*head)->prev = node;
+ node->next = *head;
+ (*head) = node;
+ }
+ }
+
+ return 0;
+}
+
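+/* drop the node at the tail of the list; since new nodes are pushed at the
+ * head and consumed from the tail, the list behaves as a FIFO queue */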
+int del_stat_node(stat_node_t **tail)
+{
+ stat_node_t *node = *tail;
+ *tail = (*tail)->prev;
+ if (*tail)
+ (*tail)->next = NULL;
+ free(node);
+
+ return 0;
+}
+
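+/* returns 0 if url is already present in the list, 1 otherwise */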
+int exists_stat_node(stat_node_t *list, const char *url)
+{
+ stat_node_t *iter = list;
+ while (iter)
+ {
+ if (strncmp(iter->url, url, URL_SZ) == 0)
+ return 0;
+
+ iter = iter->next;
+ }
+
+ return 1;
+}
+
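+/* libcurl body callback: write the fetched page straight to the FILE *
+ * handed in via CURLOPT_WRITEDATA */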
+size_t write_data(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+ /* return size * nmemb; */
+ return fwrite(ptr, size, nmemb, (FILE *) userdata);
+}
+
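+/* libcurl header callback: append each header chunk to the fixed buffer in
+ * crawl_stats_t so the status line and Location: can be parsed later */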
+size_t write_header(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+ crawl_stats_t *p = (crawl_stats_t *) userdata;
+ size_t sz = size * nmemb;
+ size_t room = sizeof(p->http_header) - p->header_size - 1;
+ size_t ncopy = sz < room ? sz : room; /* never overflow the 64 KiB buffer */
+
+ memcpy(&(p->http_header[p->header_size]), ptr, ncopy);
+ p->header_size += ncopy;
+ p->http_header[p->header_size] = 0;
+
+ return sz;
+}
+
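+/* create the curl handle and open the four stat files, either named on the
+ * command line or falling back to the defaults */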
+int curl_prepare(CURL **curl_handle, crawl_stats_t *page_stats, int argc, char **argv)
+{
+ if (!(*curl_handle = curl_easy_init()))
+ {
+ perror("failed to create curl handle");
+ return 1;
+ }
+
+ page_stats->summary_fl = fopen(argc == 6 ? argv[2] : "summary", "w");
+ page_stats->visited_fl = fopen(argc == 6 ? argv[3] : "visited", "w");
+ page_stats->checked_fl = fopen(argc == 6 ? argv[4] : "checked", "w");
+ page_stats->broken_fl = fopen(argc == 6 ? argv[5] : "broken", "w");
+
+ if (!page_stats->summary_fl || !page_stats->visited_fl ||
+ !page_stats->checked_fl || !page_stats->broken_fl)
+ {
+ perror("failed to open stat files for writing");
+ return 1;
+ }
+
+ return 0;
+}
+
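+/* cleanup: release the curl handle, write the summary counters, close the
+ * stat files, and free the visited list */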
+int curl_finish(CURL *curl_handle, crawl_stats_t *page_stats)
+{
+ /* always cleanup */
+ curl_easy_cleanup(curl_handle);
+
+ /* write out the summary */
+ fprintf(page_stats->summary_fl,
+ "Visited_pages: %u\n"
+ "Checked_URLs: %u\n"
+ "Broken_URLs: %u\n", page_stats->visited_n, page_stats->checked_n, page_stats->broken_n);
+
+ fclose(page_stats->summary_fl);
+ fclose(page_stats->visited_fl);
+ fclose(page_stats->checked_fl);
+ fclose(page_stats->broken_fl);
+
+ /* dealloc the visited list by walking the next pointers from the head */
+ stat_node_t *iter = page_stats->visited_list;
+ while (iter)
+ {
+ stat_node_t *next = iter->next;
+ free(iter);
+ iter = next;
+ }
+
+ return 0;
+}
+
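+/* fetch urlname and, if it lives under url_root, queue every link found in
+ * it; returns 0 on success, 1 if the fetch failed, 2 on a 4xx/5xx response,
+ * and 3 when the server redirected (last_redir_url holds the target) */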
+int curl_fetchpage(CURL *curl_handle, crawl_stats_t *page_stats, const char *urlname)
+{
+ CURLcode curlret;
+
+ curl_easy_setopt(curl_handle, CURLOPT_URL, urlname);
+ /* actually not needed, since we set CURLOPT_WRITEHEADER explicitly */
+ //curl_easy_setopt(curl_handle, CURLOPT_HEADER, 1L);
+ /* some servers don't like requests that are made without a user-agent
+ * field, so we provide one */
+ curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
+ char err_buff[CURL_ERROR_SIZE] = { 0 }; /* filled in by libcurl on failure */
+ curl_easy_setopt(curl_handle, CURLOPT_ERRORBUFFER, err_buff);
+ /* send all data to this function */
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_data);
+ curl_easy_setopt(curl_handle, CURLOPT_HEADERFUNCTION, write_header);
+
+ /* dump the page body to curl.data, falling back to stdout if it can't be
+ * opened; the header callback always gets our stats struct, never a FILE * */
+ FILE *data = fopen("curl.data", "w");
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, data ? (void *) data : (void *) stdout);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEHEADER, (void *) page_stats);
+
+ if ((curlret = curl_easy_perform(curl_handle)) != CURLE_OK)
+ {
+ fprintf(stderr, "failed to fetch %s, %s\n", urlname, err_buff);
+ page_stats->header_size = 0; /* kill current header */
+ if (data)
+ fclose(data);
+
+ return 1;
+ }
+
+ int ret = -1;
+ fprintf(stdout, "fetched %s\n", urlname);
+ int code = 0; /* HTTP status code parsed out of the header */
+ char code_msg[256] = { 0 };
+ sscanf(page_stats->http_header, "HTTP/1.%*[01] %d %[^\r\n] ", &code, code_msg);
+ fprintf(stdout, "\t%d %s, ", code, code_msg);
+ /* printf("%s", page_stats->http_header); */
+ /* ask for the content-type */
+ char *ct = NULL;
+ ret = curl_easy_getinfo(curl_handle, CURLINFO_CONTENT_TYPE, &ct);
+ if ((CURLE_OK == ret) && ct)
+ printf("content-type: %s\n", ct);
+
+ if (code >= 400 && code < 600) /* 4xx and 5xx responses count as broken */
+ {
+ page_stats->header_size = 0; /* kill current header */
+ if (data)
+ fclose(data);
+
+ return 2;
+ }
+ else if (code == 301 || code == 302 || code == 303 || code == 307 || code == 308)
+ {
+ char newurl[URL_SZ] = { 0 };
+ char *location = strstr(page_stats->http_header, "Location: ");
+ if (location) /* guard against a redirect without a Location header */
+ sscanf(location, "Location: %[^\r\n] ", newurl);
+ if (newurl[0] == '/') /* got relative redir, this sucks */
+ {
+ char fullurl[URL_SZ] = { 0 };
+ snprintf(fullurl, URL_SZ, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &newurl[1] : &newurl[0]);
+ strncpy(newurl, fullurl, URL_SZ);
+ }
+ printf("\tredirecting to: %s\n", strlen(newurl) > 120 ? "too long to display" : newurl);
+ strncpy(last_redir_url, newurl, URL_SZ);
+ page_stats->header_size = 0; /* kill current header */
+ if (data)
+ fclose(data);
+
+ return 3;
+ }
+
+ if (data)
+ fclose(data);
+ /* external regexp to parse out all (a | img) links: put each tag on its own
+ * line, pull out the href/src value, and drop in-page anchors (#) */
+ ret = system("sed 's/>/>\\n/g' curl.data | sed -r -n 's/.*<(a|img)[^>]"
+ "*(href|src)=[\"'\\'']?([^\" >]*)[\"'\\'']?.*$/\\3/p' | awk '!/#/' > urls.txt");
+
+ /* we need to stay chrooted */
+ if (strstr(urlname, page_stats->url_root))
+ {
+ FILE *urls_fl;
+ if (!(urls_fl = fopen("urls.txt", "r")))
+ perror("failed to add urls to the queue, ignoring");
+ else
+ {
+ fseek(urls_fl, 0L, SEEK_END);
+ size_t file_sz = ftell(urls_fl);
+ fseek(urls_fl, 0L, SEEK_SET);
+ /* count the page as visited if we pulled any links out of it or it was
+ * served as text/html */
+ if (file_sz || (ct && strstr(ct, "text/html")))
+ {
+ fprintf(page_stats->visited_fl, "%s\n", urlname);
+ page_stats->visited_n++;
+ }
+
+ char url[URL_SZ] = { 0 };
+ while (fgets(url, URL_SZ, urls_fl))
+ {
+ if (strchr(url, '@')) /* I assume these are e-mails */
+ continue;
+ size_t url_len = strlen(url);
+ if (url_len && url[url_len - 1] == '\n')
+ url[url_len - 1] = '\0'; /* kill newline */
+
+ if (strstr(url, page_stats->url_root) || (strstr(url, ".com") ||
+ strstr(url, ".org") || strstr(url, ".edu") || strstr(url, ".net") ||
+ strstr(url, ".info") || strstr(url, ".uk") || strstr(url, ".to")))
+ ; /* already absolute (our root or an external site), leave it */
+ else if (url[0] == '/' && url[1] != '~')
+ {
+ char url_absolute[URL_SZ] = { 0 };
+ snprintf(url_absolute, URL_SZ, "%s%s", urlname, urlname[strlen(urlname)-1] == '/' ? &url[1] : url);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else if (strstr(url, "/~"))
+ {
+ char url_absolute[URL_SZ] = { 0 };
+ snprintf(url_absolute, URL_SZ, "%s%s", page_stats->domain, url);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else if (strstr(url, "../"))
+ {
+ char url_absolute[URL_SZ] = { 0 };
+ char url_root_dup[URL_SZ] = { 0 };
+ strcpy(url_root_dup, page_stats->url_root);
+ if (url_root_dup[strlen(url_root_dup)-1] == '/')
+ url_root_dup[strlen(url_root_dup)-1] = '\0';
+ char *last = strrchr(url_root_dup, '/');
+ if (last) /* chop off the last path component */
+ *last = 0;
+
+ snprintf(url_absolute, URL_SZ, "%s/%s", url_root_dup, &url[3]);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else if (strstr(url, "./"))
+ {
+ char url_absolute[URL_SZ] = { 0 };
+ snprintf(url_absolute, URL_SZ, "%s%s", page_stats->url_root, &url[2]);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else if (strstr(url, "http:") && !strstr(url, "http://"))
+ {
+ char *colon = strchr(url, ':');
+ char url_absolute[URL_SZ] = { 0 };
+ snprintf(url_absolute, URL_SZ, "%s%s", page_stats->url_root, &colon[1]);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else /* relative webpage, e.g. page.html */
+ {
+ if (urlname[strlen(urlname)-1] == '/')
+ {
+ char url_absolute[URL_SZ] = { 0 };
+ snprintf(url_absolute, URL_SZ, "%s%s%s", urlname,
+ urlname[strlen(urlname)-1] == '/' ? "" : "/", url);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ else
+ { /* e.g. domain.com/dir1/page1.html and user requests page2.html */
+ char url_absolute[URL_SZ] = { 0 };
+ char urlname_dup[URL_SZ] = { 0 };
+ strncpy(urlname_dup, urlname, URL_SZ);
+ char *lastslash = strrchr(urlname_dup, '/');
+ lastslash[1] = '\0';
+ snprintf(url_absolute, URL_SZ, "%s%s", urlname_dup, url);
+ strncpy(url, url_absolute, URL_SZ);
+ }
+ }
+
+ if (exists_stat_node(page_stats->url_queue, url) == 1) /* avoid dups */
+ {
+ add_stat_node(&page_stats->url_queue, &page_stats->tail, url, urlname);
+ printf("\tqueued for fetching: %s\n", url);
+ }
+ else
+ printf("\talready queued up: %s\n", url);
+
+ memset(url, 0, sizeof(url)); /* clear the buffer before the next fgets */
+ }
+
+ fclose(urls_fl);
+ }
+ }
+ else
+ fprintf(stdout, "\tnot scanning %s for links\n", urlname);
+
+ page_stats->header_size = 0; /* kill current header */
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc != 2 && argc != 6)
+ {
+ fprintf(stderr, "usage: %s <url>\n", argv[0]);
+ fprintf(stderr, "usage: %s <url> <summary> <visited> <checked> <broken>\n", argv[0]);
+ return 1;
+ }
+
+ CURL *curl_handle;
+ crawl_stats_t page_stats;
+ memset(&page_stats, 0, sizeof(page_stats));
+ if (curl_prepare(&curl_handle, &page_stats, argc, argv) != 0)
+ return 1;
+
+ /* call our curl routine to fetch a webpage */
+ int ret = -1;
+ int wasredir = 0;
+
+ /* make sure url ends with /; use a full URL_SZ buffer so there is room to
+ * append the slash (a strndup of exactly the argument would overflow) */
+ char *url_arg = calloc(1, URL_SZ);
+ if (!url_arg)
+ {
+ perror("failed to alloc url buffer");
+ return 1;
+ }
+ strncpy(url_arg, argv[1], URL_SZ - 2);
+ size_t url_arg_sz = strlen(url_arg);
+ if (url_arg_sz && url_arg[url_arg_sz - 1] != '/')
+ {
+ url_arg[url_arg_sz] = '/';
+ url_arg[url_arg_sz + 1] = '\0';
+ }
+
+ /* figure out initial domain name and root url */
+ char initial_domain[URL_SZ] = { 0 };
+ char domain[URL_SZ] = { 0 };
+ char protocol[32] = { 0 };
+ /* sscanf(url_arg, "%*[(http:|ftp:)]//%[^/]", domain); */
+ sscanf(url_arg, "%[^:]://%[^/]", protocol, domain);
+ snprintf(initial_domain, URL_SZ, "%s://%s", protocol, domain);
+ printf("initial domain is: %s\n", initial_domain);
+ strncpy(page_stats.domain, initial_domain, URL_SZ);
+ strncpy(page_stats.url_root, url_arg, URL_SZ);
+
+ do
+ { /* loop through all redirs until we reach root url */
+ ret = curl_fetchpage(curl_handle, &page_stats, wasredir ? last_redir_url : url_arg); /* kickstart */
+ if (ret == 3)
+ {
+ wasredir = 1;
+ strncpy(page_stats.url_root, last_redir_url, URL_SZ); /* deal with redir of root url */
+ }
+ } while (ret == 3);
+
+ if (ret == 1 || ret == 2)
+ {
+ fprintf(page_stats.broken_fl, "%s\n", wasredir ? last_redir_url : url_arg);
+ fprintf(page_stats.checked_fl, "%s\n", wasredir ? last_redir_url : url_arg);
+ page_stats.checked_n++;
+ page_stats.broken_n++;
+ }
+ else if (ret == 0)
+ {
+ add_stat_node(&(page_stats.visited_list), NULL, wasredir ? last_redir_url : url_arg, NULL); /* mark url as visited */
+ fprintf(page_stats.checked_fl, "%s\n", wasredir ? last_redir_url : url_arg);
+ page_stats.checked_n++;
+
+ while (page_stats.tail)
+ {
+ char url[URL_SZ] = { 0 };
+ char from_url[URL_SZ] = { 0 };
+ strncpy(url, page_stats.tail->url, URL_SZ);
+ strncpy(from_url, page_stats.tail->from_url, URL_SZ);
+ del_stat_node(&(page_stats.tail)); /* dequeue */
+ if (exists_stat_node(page_stats.visited_list, url) == 0)
+ {
+ fprintf(stderr, "already visited: %s\n", url);
+ continue;
+ }
+
+ ret = curl_fetchpage(curl_handle, &page_stats, url);
+ add_stat_node(&(page_stats.visited_list), NULL, url, NULL); /* mark url as visited */
+
+ fprintf(page_stats.checked_fl, "%s\n", url);
+ page_stats.checked_n++;
+
+ switch (ret)
+ {
+ case 1: /* fetch failed, count it as broken as well */
+ case 2: /* broken */
+ fprintf(page_stats.broken_fl, "%s from %s\n", url, from_url);
+ page_stats.broken_n++;
+ break;
+ case 3: /* redir */
+ if (strlen(last_redir_url) == URL_SZ - 1)
+ fprintf(stderr, "\tignoring very long redir, stupid host\n");
+ else
+ add_stat_node(&(page_stats.url_queue), &(page_stats.tail), last_redir_url, NULL);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ /* cleanup */
+ curl_finish(curl_handle, &page_stats);
+ if (url_arg)
+ free(url_arg);
+
+ return 0;
+}
+