summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKyle K <kylek389@gmail.com>2011-07-06 20:22:06 -0500
committerKamil Kaminski <kamilkss@gmail.com>2011-07-06 20:22:06 -0500
commit1643ba9d4fd90f4a44c5a70207e37c524b2ae665 (patch)
treee5107f1a21b227ec48259a31dba9e558d7831290
parentfd6f82b5c6b3869d893e48b4b25ba6bf195eeb3a (diff)
downloadsandbox-1643ba9d4fd90f4a44c5a70207e37c524b2ae665.tar.gz
sandbox-1643ba9d4fd90f4a44c5a70207e37c524b2ae665.tar.bz2
sandbox-1643ba9d4fd90f4a44c5a70207e37c524b2ae665.zip
add new try at a tokenizer
-rw-r--r--Makefile2
-rw-r--r--dup.c2
-rw-r--r--string_tokenizer.c353
3 files changed, 356 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index c12e440..59b02eb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
BINS = ascii class depipe_strings dup fpipe pipe realloc strpbrk strsep \
- tokenizer getopt prime_mask linked_list pi_bbp
+ tokenizer getopt prime_mask linked_list pi_bbp string_tokenizer
CC = gcc
CFLAGS = -Wall -std=gnu99 -pedantic
DBGFLAGS = -g -O0
diff --git a/dup.c b/dup.c
index e70b804..7112628 100644
--- a/dup.c
+++ b/dup.c
@@ -53,6 +53,8 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
+ ret = ret; /* hush the dumb compiler */
+
close(pfds[0]);
close(pfds[1]);
diff --git a/string_tokenizer.c b/string_tokenizer.c
new file mode 100644
index 0000000..2a5998c
--- /dev/null
+++ b/string_tokenizer.c
@@ -0,0 +1,353 @@
+/* string_tokenizer.c
+ *
+ * Notes: Seems like strsep and a linked list is the way to go, but then the
+ * original string is still destroyed with strsep. So the idea is to work on a
+ * copy of the string and let strsep do its job. One pitfall of a linked list
+ * is that accessing data is not as flexible as with arrays
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <libgen.h>
+
+void str2dArrTest(void);
+void strTokenizerTestPrint(char *, const char *);
+size_t strStrtokNumOfTokens(const char *, const char *);
+char **strTokenize(const char *, const char *, const size_t);
+void strTokenizeFree(char **);
+void strTokenizeAndPrint(const char *, const char *, const size_t);
+extern inline int isDelim(const char, const char *);
+size_t numOfTokens(const char *, const char *);
+#ifdef _WIN32
+char *strsep(char**, const char*);
+#endif
+
+void areNumOfTokensFuncConsistent(const char *, const char *);
+void testTokenizer(void);
+void strsepTest(void);
+
struct tokenNode
{
    const char *tok;            /* points into the caller's (mutated) buffer */
    struct tokenNode *next;
};

/* builds a singly-linked list of tokens by splitting str on any character in
 * delim; tokens appear in REVERSE order (each is pushed onto the front), and
 * the list ends in a sentinel node whose tok is NULL
 *
 * this function assumes that a copy of a str buffer has already been made:
 * strsep writes '\0' into str, and every node's tok points into it, so str
 * must outlive the returned list; caller owns and must free the nodes
 */
struct tokenNode *strTokenizeLinkedList(char *str, const char *delim)
{
    if (!str || !delim)
        return NULL;

    struct tokenNode *head = malloc(sizeof *head);
    if (!head)
    {
        fprintf(stderr, "strTokenizeLinkedList failed, %s:%d\n", __FILE__, __LINE__);
        perror("malloc");
        exit(-1);
    }
    /* fix: the sentinel's fields were left uninitialized, so traversals that
     * stop on tok == NULL read indeterminate memory (undefined behavior) */
    head->tok = NULL;
    head->next = NULL;

    char *tok;
    while ((tok = strsep(&str, delim)) != NULL)
    {
        struct tokenNode *newNode = malloc(sizeof *newNode);
        if (!newNode)   /* fix: this malloc was unchecked */
        {
            fprintf(stderr, "strTokenizeLinkedList failed, %s:%d\n", __FILE__, __LINE__);
            perror("malloc");
            exit(-1);
        }

        /* push onto the front of the list */
        newNode->tok = tok;
        newNode->next = head;
        head = newNode;
    }

    return head;
}
+
#ifdef _WIN32
/* fallback strsep for platforms that lack it (BSD semantics): returns the
 * token starting at *stringp, NUL-terminates it at the first delimiter
 * character, and advances *stringp past that delimiter; once the input is
 * exhausted *stringp is set to NULL and subsequent calls return NULL */
char *strsep(char** stringp, const char* delim)
{
    if (stringp == NULL || *stringp == NULL)
        return NULL;

    char *token = *stringp;
    char *p = token;

    /* scan forward to the first delimiter character (or the terminator) */
    while (*p != '\0' && strchr(delim, *p) == NULL)
        p++;

    if (*p != '\0')
    {
        *p = '\0';          /* split the token here */
        *stringp = p + 1;   /* next call resumes after the delimiter */
    }
    else
        *stringp = NULL;    /* no delimiter left: this was the final token */

    return token;
}
#endif
+
/* demo: splits a strdup'd sentence on spaces and prints each word */
void strsepTest(void)
{
    char *buffer, *words[100], *aPtr;
    int count = 0, i;

    buffer = strdup("The quick brown fox jumps over the lazy dog");
    if (!buffer)
        return;

    /* strsep advances buffer all the way to NULL; keep the original pointer
     * around so the allocation can be released (fix: it was leaked) */
    char *orig = buffer;

    while ((aPtr = strsep(&buffer, " ")) && count < 100)
        words[count++] = aPtr;

    for (i = 0; i < count; i++)
        printf("%s\n", words[i]);

    free(orig);
}
+
/* demo of a "2-D array of strings": 200 heap-allocated 31-byte slots,
 * two of which are filled and printed */
void str2dArrTest(void)
{
    char *str_arr[200];

    /* fix: the original did *str_arr = malloc(sizeof(char *) * 200) here,
     * which was immediately overwritten by the loop below — a pure leak */
    int i;
    for (i = 0; i < 200; i++)
        str_arr[i] = (char *) malloc(sizeof(char) * 31);

    strcpy(str_arr[0], "Hello");
    strcpy(str_arr[1], ", world!");

    printf("%s", str_arr[0]);
    printf("%s", str_arr[1]);

    /* fix: release all 200 allocations (original freed nothing) */
    for (i = 0; i < 200; i++)
        free(str_arr[i]);
}
+
/* warning, str will get modified: strtok overwrites each delimiter in str
 * with '\0' as it walks; prints every token separated by a space */
void strTokenizerTestPrint(char *str, const char *delim)
{
    printf("splitting string \"%s\" into tokens: \n", str);

    /* first call primes strtok with str, subsequent calls pass NULL to
     * continue from strtok's internal position */
    for (char *token = strtok(str, delim); token != NULL; token = strtok(NULL, delim))
        printf("%s ", token);
}
+
/* counts the tokens strtok would produce for (str, delim); returns 0 for
 * NULL arguments or an empty string; str itself is left untouched */
size_t strStrtokNumOfTokens(const char *str, const char *delim)
{
    if (!str || !delim)
        return 0;

    /* strtok clobbers its input, so count on a private duplicate */
    size_t len = strlen(str);
    char *dup = (char *) malloc(len + 1);
    if (!dup)
    {
        fprintf(stderr, "strStrtokNumOfTokens failed, %s:%d\n", __FILE__, __LINE__);
        perror("malloc");
        exit(-1);
    }
    memcpy(dup, str, len + 1);

    size_t count = 0;
    char *token = strtok(dup, delim);
    while (token != NULL)
    {
        count++;
        token = strtok(NULL, delim);
    }

    free(dup);

    return count;
}
+
/* returns tokenized array of strings / char pointers where the last token
 * in the array points to NULL address, this function fills the array of strings
 * with a maximum number of maxTokenSize characters if a token exceeds that limit
 *
 * returns NULL when maxTokenSize is out of range or str holds no tokens;
 * caller owns the result and must release it with strTokenizeFree
 */
char **strTokenize(const char *str, const char *delim, const size_t maxTokenSize)
{
    if (maxTokenSize < 1 || maxTokenSize > 100) /* keep it sane */
        return NULL;

    size_t tokensNum;
    if ((tokensNum = strStrtokNumOfTokens(str, delim)) == 0)
        return NULL;

    /* make a copy of a string since strtok modifies it */
    char *strcopy = malloc(strlen(str) + 1);
    if (!strcopy)
    {
        fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__);
        perror("malloc");
        exit(-1);
    }
    strcpy(strcopy, str);

    /* one extra slot so the array is NULL-terminated for the caller */
    size_t sizeOfTokenPtrArr = sizeof(char *) * (tokensNum + 1);
    char **tokensArr = malloc(sizeOfTokenPtrArr);
    if (!tokensArr)
    {
        fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__);
        perror("malloc");
        exit(-1);
    }
    memset(tokensArr, 0, sizeOfTokenPtrArr);

    /* fix: index is size_t now (was int compared against a size_t bound) */
    size_t i;
    for (i = 0; i < tokensNum; i++)
    {
        tokensArr[i] = malloc(sizeof(char) * (maxTokenSize + 1));
        if (!(tokensArr[i]))
        {
            fprintf(stderr, "strTokenize failed, %s:%d\n", __FILE__, __LINE__);
            perror("malloc");
            exit(-1);
        }
    }

    size_t len;
    char *pch = strtok(strcopy, delim);
    for (i = 0; pch != NULL; i++)
    {
        len = strlen(pch);

        if (len <= maxTokenSize)
            strcpy(tokensArr[i], pch);
        else
        {
            /* truncate over-long tokens and NUL-terminate explicitly
             * (strncpy does not terminate when it truncates) */
            strncpy(tokensArr[i], pch, maxTokenSize);
            tokensArr[i][maxTokenSize] = '\0';
        }

        pch = strtok(NULL, delim);
    }

    free(strcopy); /* fix: the working copy was leaked on every call */

    return tokensArr;
}
+
/* releases an array produced by strTokenize: every string up to the NULL
 * sentinel, then the pointer array itself; NULL input is a no-op */
void strTokenizeFree(char **tokenArr)
{
    if (!tokenArr)
        return;

    int i;
    for (i = 0; tokenArr[i] != NULL; i++)
        free(tokenArr[i]);

    free(tokenArr); /* fix: the pointer array itself was leaked */
}
+
/* convenience wrapper: tokenizes strings with delim (tokens capped at
 * maxTokenLen chars), prints each one quoted and numbered, then releases
 * the token array; exits on tokenization failure */
void strTokenizeAndPrint(const char *strings, const char *delim, const size_t maxTokenLen)
{
    char **tokenized = strTokenize(strings, delim, maxTokenLen);

    if (tokenized == NULL)
    {
        fprintf(stderr, "strTokenizeAndPrint failed\n");
        exit(-1);
    }

    int n = 0;
    while (tokenized[n] != NULL)
    {
        printf("token %d: \"%s\"\n", n + 1, tokenized[n]);
        n++;
    }

    strTokenizeFree(tokenized);
}
+
/* returns 1 if c is one of the characters in delim, 0 otherwise; '\0' and a
 * NULL delim are never delimiters
 *
 * declared extern inline near the top of the file, so per C99 inline
 * semantics this inline definition pairs with that external declaration */
inline int isDelim(const char c, const char *delim)
{
    /* guard c == '\0' explicitly: strchr would otherwise match the
     * terminating NUL of delim and wrongly report '\0' as a delimiter */
    if (!delim || c == '\0')
        return 0;

    /* fix: replaces a hand-rolled scan whose int index was compared against
     * a size_t bound; strchr does the same membership test */
    return strchr(delim, c) != NULL;
}
+
/* strtok-free token counter: scans str once and counts non-delimiter runs
 * by watching delimiter/non-delimiter transitions; returns 0 for NULL
 * arguments or an empty string */
size_t numOfTokens(const char *str, const char *delim)
{
    if (!str || !delim || str[0] == '\0')
        return 0;

    size_t count = 0;
    int prevWasDelim = isDelim(str[0], delim);

    for (int i = 1; str[i] != '\0'; i++)
    {
        int currIsDelim = isDelim(str[i], delim);

        /* a token ends exactly where a non-delimiter is followed by a
         * delimiter */
        if (!prevWasDelim && currIsDelim)
            count++;

        prevWasDelim = currIsDelim;
    }

    /* a trailing non-delimiter character closes one last token */
    if (!prevWasDelim)
        count++;

    return count;
}
+
+/* test */
+void areNumOfTokensFuncConsistent(const char *str, const char *delim)
+{
+ const size_t x = strStrtokNumOfTokens(str, delim);
+ const size_t y = numOfTokens(str, delim);
+
+ if (x == y)
+ printf("PASS: numOfTokens and strStrtokNumOfTokens are consistent!\n");
+ else
+ fprintf(stderr, "FAIL: numOfTokens and strStrtokNumOfTokens are inconsistent!\n"
+ "numOfTokens found %lu tokens, strStrtokNumOfTokens found "
+ "%lu tokens\n", y, x);
+}
+
/* driver: tokenizes a sample paragraph (splitting on space, comma, period)
 * and then cross-checks the two token-counting implementations on it */
void testTokenizer(void)
{
    const char *sample =
        "You've reached the website for Arch Linux"
        ", a lightweight and flexible Linux® distribution that tries to Keep It Simple."
        "Currently we have official packages optimized for the i686 and x86-64 architectures."
        "We complement our official package sets with a community-operated package "
        "repository that grows in size and quality each and every day.";

    const char *separators = " ,.";

    strTokenizeAndPrint(sample, separators, 31);
    areNumOfTokensFuncConsistent(sample, separators);
}
+
+int main(int argc, char **argv)
+{
+#if 0
+ strTokenizeAndPrint("v 1//2", " /", 21);
+ printf("dirname is: \"%s\", basename is: \"%s\"\n", dirname(argv[0]), basename(argv[0]));
+#endif
+
+ char str[] = "oh noes I will get modified by strsep, I'm so scared";
+
+ struct tokenNode *head = strTokenizeLinkedList(str, " ");
+ int i;
+ for (i = 0; head->tok != NULL; i++)
+ {
+ printf("token: %s\n", head->tok);
+ head = head->next;
+ }
+
+
+ return 0;
+}
+