/* string_tokenizer.c
 *
 * Recovered from commit 1643ba9d4fd90f4a44c5a70207e37c524b2ae665
 * ("add new try at a tokenizer", Kyle K, Wed, 6 Jul 2011 20:22:06 -0500).
 *
 * Notes: Seems like strsep and linked list is a way to go, but then original
 *        string is still destroyed with strsep. So the idea is to work on a copy
 *        of a string and let strsep do its job. One pitfall of a linked list
 *        is that accessing data is not as flexible as with arrays
 *
 * Portability: strsep() and strdup() are not ISO C and are hidden by strict
 * -std=c11 builds, so this file only relies on its own private equivalents
 * (sepToken / dupString). The public strsep() shim is still provided on
 * _WIN32, exactly as before.
 *
 * Define STRING_TOKENIZER_NO_MAIN to leave out the demo main() when linking
 * these functions into another program or a test harness.
 */

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum
{
    STR_TOKENIZE_MAX_TOKEN_SIZE = 100, /* upper bound accepted by strTokenize */
    STRSEP_TEST_MAX_WORDS = 100,       /* capacity of the strsepTest word table */
    STR_2D_ARR_ROWS = 200,             /* rows allocated by str2dArrTest */
    STR_2D_ARR_ROW_SIZE = 31           /* bytes per row in str2dArrTest */
};

/* Node of the token list built by strTokenizeLinkedList. The list is
 * terminated by a sentinel node whose tok is NULL and whose next is NULL. */
struct tokenNode
{
    const char *tok;
    struct tokenNode *next;
};

void str2dArrTest(void);
void strTokenizerTestPrint(char *str, const char *delim);
size_t strStrtokNumOfTokens(const char *str, const char *delim);
char **strTokenize(const char *str, const char *delim, const size_t maxTokenSize);
void strTokenizeFree(char **tokenArr);
void strTokenizeAndPrint(const char *strings, const char *delim, const size_t maxTokenLen);
int isDelim(const char c, const char *delim);
size_t numOfTokens(const char *str, const char *delim);
struct tokenNode *strTokenizeLinkedList(char *str, const char *delim);
void strTokenizeLinkedListFree(struct tokenNode *head);
#ifdef _WIN32
char *strsep(char **stringp, const char *delim);
#endif

void areNumOfTokensFuncConsistent(const char *str, const char *delim);
void testTokenizer(void);
void strsepTest(void);

/* Reports an allocation failure on behalf of `caller` and terminates the
 * process. errno is saved around fprintf so perror reports malloc's error. */
static void dieOnAllocFailure(const char *caller, int line)
{
    const int savedErrno = errno;

    fprintf(stderr, "%s failed, %s:%d\n", caller, __FILE__, line);
    errno = savedErrno;
    perror("malloc");
    exit(EXIT_FAILURE);
}

/* Returns a heap copy of `str` (which must not be NULL), or NULL when the
 * allocation fails. The caller owns the copy and must free() it. */
static char *dupString(const char *str)
{
    const size_t size = strlen(str) + 1;
    char *copy = malloc(size);

    if (copy)
    {
        memcpy(copy, str, size);
    }

    return copy;
}

/* Portable equivalent of BSD strsep(): cuts the next token off *stringp by
 * overwriting the first delimiter with '\0' and advancing *stringp past it.
 * *stringp becomes NULL once the final token has been handed out. Adjacent
 * delimiters therefore produce empty tokens. Returns NULL when stringp or
 * *stringp is NULL. */
static char *sepToken(char **stringp, const char *delim)
{
    char *token;
    char *end;

    if (stringp == NULL || *stringp == NULL)
    {
        return NULL;
    }

    token = *stringp;
    end = token + strcspn(token, delim);

    if (*end != '\0')
    {
        *end = '\0';
        *stringp = end + 1;
    }
    else
    {
        *stringp = NULL;
    }

    return token;
}

/* Splits `str` in place on any character of `delim` and returns a linked list
 * of the tokens. This function assumes that a copy of the str buffer has
 * already been made: the buffer is modified and the nodes point into it.
 *
 * Each token is pushed onto the front of the list, so the list runs from the
 * LAST token to the first and ends with a sentinel node (tok == NULL,
 * next == NULL).
 * NOTE(review): reverse order is what the original code produced; confirm
 * that callers really want it before relying on it.
 *
 * Returns NULL when str or delim is NULL. Terminates the process when out of
 * memory. Release the list with strTokenizeLinkedListFree(). */
struct tokenNode *strTokenizeLinkedList(char *str, const char *delim)
{
    struct tokenNode *head;
    char *tok;

    if (!str || !delim)
    {
        return NULL;
    }

    head = malloc(sizeof *head);
    if (!head)
    {
        dieOnAllocFailure("strTokenizeLinkedList", __LINE__);
    }
    /* The sentinel must be initialised: callers stop on tok == NULL. */
    head->tok = NULL;
    head->next = NULL;

    while ((tok = sepToken(&str, delim)) != NULL)
    {
        struct tokenNode *node = malloc(sizeof *node);

        if (!node)
        {
            dieOnAllocFailure("strTokenizeLinkedList", __LINE__);
        }
        node->tok = tok;
        node->next = head;
        head = node;
    }

    return head;
}

/* Frees every node of a list returned by strTokenizeLinkedList, including the
 * sentinel. The token text is not freed because it belongs to the caller's
 * buffer. A NULL list is accepted. */
void strTokenizeLinkedListFree(struct tokenNode *head)
{
    while (head != NULL)
    {
        struct tokenNode *next = head->next;

        free(head);
        head = next;
    }
}

#ifdef _WIN32
/* strsep() replacement for platforms whose C library lacks it. */
char *strsep(char **stringp, const char *delim)
{
    return sepToken(stringp, delim);
}
#endif

/* Splits a fixed sentence on spaces with strsep-style tokenizing and prints
 * one word per line. */
void strsepTest(void)
{
    char *words[STRSEP_TEST_MAX_WORDS];
    size_t count = 0;
    size_t i;
    char *cursor;
    char *word;
    char *buffer = dupString("The quick brown fox jumps over the lazy dog");

    if (!buffer)
    {
        return;
    }

    /* Tokenize through a second pointer: the separator overwrites its
     * argument, and `buffer` is still needed for free(). The capacity check
     * comes first so no token is consumed and then dropped. */
    cursor = buffer;
    while (count < STRSEP_TEST_MAX_WORDS && (word = sepToken(&cursor, " ")) != NULL)
    {
        words[count++] = word;
    }

    for (i = 0; i < count; i++)
    {
        printf("%s\n", words[i]);
    }

    free(buffer);
}

/* Allocates a table of fixed-size string rows, writes two of them, prints
 * them back to back and releases the table again. */
void str2dArrTest(void)
{
    char *rows[STR_2D_ARR_ROWS];
    size_t i;

    for (i = 0; i < STR_2D_ARR_ROWS; i++)
    {
        rows[i] = malloc(STR_2D_ARR_ROW_SIZE);
        if (!rows[i])
        {
            dieOnAllocFailure("str2dArrTest", __LINE__);
        }
    }

    strcpy(rows[0], "Hello");
    strcpy(rows[1], ", world!");

    printf("%s", rows[0]);
    printf("%s", rows[1]);

    for (i = 0; i < STR_2D_ARR_ROWS; i++)
    {
        free(rows[i]);
    }
}

/* Prints `str` and then each of its tokens separated by a space.
 * Warning: str gets modified (strtok writes terminators into it).
 * Does nothing when str or delim is NULL. */
void strTokenizerTestPrint(char *str, const char *delim)
{
    char *pch;

    if (!str || !delim)
    {
        return;
    }

    printf("splitting string \"%s\" into tokens: \n", str);

    for (pch = strtok(str, delim); pch != NULL; pch = strtok(NULL, delim))
    {
        printf("%s ", pch);
    }
}

/* Counts the tokens strtok() finds in `str` when splitting on `delim`.
 * Works on a private copy, so `str` itself is left untouched. Returns 0 when
 * either argument is NULL. Terminates the process when out of memory. */
size_t strStrtokNumOfTokens(const char *str, const char *delim)
{
    char *strcopy;
    char *pch;
    size_t ret = 0;

    if (!str || !delim)
    {
        return 0;
    }

    /* make a copy of a string since strtok modifies it */
    strcopy = dupString(str);
    if (!strcopy)
    {
        dieOnAllocFailure("strStrtokNumOfTokens", __LINE__);
    }

    for (pch = strtok(strcopy, delim); pch != NULL; pch = strtok(NULL, delim))
    {
        ret++;
    }

    free(strcopy);

    return ret;
}

/* Returns a tokenized array of strings / char pointers whose last element is
 * NULL. Every token gets its own buffer of maxTokenSize + 1 bytes; a token
 * longer than maxTokenSize characters is truncated to that length.
 *
 * Returns NULL when maxTokenSize is outside 1..STR_TOKENIZE_MAX_TOKEN_SIZE,
 * when str or delim is NULL, or when str contains no tokens. Terminates the
 * process when out of memory. Release the result with strTokenizeFree(). */
char **strTokenize(const char *str, const char *delim, const size_t maxTokenSize)
{
    size_t tokensNum;
    size_t i;
    char *strcopy;
    char *pch;
    char **tokensArr;

    if (maxTokenSize < 1 || maxTokenSize > STR_TOKENIZE_MAX_TOKEN_SIZE) /* keep it sane */
    {
        return NULL;
    }

    tokensNum = strStrtokNumOfTokens(str, delim);
    if (tokensNum == 0)
    {
        return NULL;
    }

    /* make a copy of a string since strtok modifies it */
    strcopy = dupString(str);
    if (!strcopy)
    {
        dieOnAllocFailure("strTokenize", __LINE__);
    }

    tokensArr = calloc(tokensNum + 1, sizeof *tokensArr);
    if (!tokensArr)
    {
        dieOnAllocFailure("strTokenize", __LINE__);
    }

    /* Bounded by tokensNum as well, so the loop can never write past the
     * slots that were allocated above. */
    pch = strtok(strcopy, delim);
    for (i = 0; i < tokensNum && pch != NULL; i++)
    {
        size_t len = strlen(pch);

        if (len > maxTokenSize)
        {
            len = maxTokenSize;
        }

        tokensArr[i] = malloc(maxTokenSize + 1);
        if (!tokensArr[i])
        {
            dieOnAllocFailure("strTokenize", __LINE__);
        }
        memcpy(tokensArr[i], pch, len);
        tokensArr[i][len] = '\0';

        pch = strtok(NULL, delim);
    }
    tokensArr[i] = NULL;

    free(strcopy);

    return tokensArr;
}

/* Releases an array returned by strTokenize: every token buffer and then the
 * pointer array itself. A NULL array is accepted. */
void strTokenizeFree(char **tokenArr)
{
    size_t i;

    if (!tokenArr)
    {
        return;
    }

    for (i = 0; tokenArr[i] != NULL; i++)
    {
        free(tokenArr[i]);
    }

    free(tokenArr);
}

/* Tokenizes `strings` with strTokenize and prints the tokens, numbered from
 * 1, one per line. Terminates the process when strTokenize returns NULL. */
void strTokenizeAndPrint(const char *strings, const char *delim, const size_t maxTokenLen)
{
    size_t i;
    char **tokenized = strTokenize(strings, delim, maxTokenLen);

    if (tokenized == NULL)
    {
        fprintf(stderr, "strTokenizeAndPrint failed\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; tokenized[i] != NULL; i++)
    {
        printf("token %zu: \"%s\"\n", i + 1, tokenized[i]);
    }

    strTokenizeFree(tokenized);
}

/* Returns 1 when `c` is one of the characters in `delim`, otherwise 0.
 * A NULL delim matches nothing, and '\0' is never a delimiter (strchr alone
 * would match the terminator, hence the explicit check). */
int isDelim(const char c, const char *delim)
{
    if (!delim || c == '\0')
    {
        return 0;
    }

    return strchr(delim, c) != NULL;
}

/* Counts the maximal runs of non-delimiter characters in `str` without
 * modifying or copying it. Returns 0 when either argument is NULL or when
 * str is empty. */
size_t numOfTokens(const char *str, const char *delim)
{
    size_t count = 0;
    size_t i;
    int inToken = 0;

    if (!str || !delim)
    {
        return 0;
    }

    for (i = 0; str[i] != '\0'; i++)
    {
        if (isDelim(str[i], delim))
        {
            inToken = 0;
        }
        else if (!inToken)
        {
            /* first character of a new token */
            inToken = 1;
            count++;
        }
    }

    return count;
}

/* test: checks that both token counters agree on the given input and prints
 * a PASS line to stdout or a FAIL line to stderr. */
void areNumOfTokensFuncConsistent(const char *str, const char *delim)
{
    const size_t x = strStrtokNumOfTokens(str, delim);
    const size_t y = numOfTokens(str, delim);

    if (x == y)
    {
        printf("PASS: numOfTokens and strStrtokNumOfTokens are consistent!\n");
    }
    else
    {
        fprintf(stderr, "FAIL: numOfTokens and strStrtokNumOfTokens are inconsistent!\n"
                        "numOfTokens found %zu tokens, strStrtokNumOfTokens found "
                        "%zu tokens\n", y, x);
    }
}

/* Runs the array tokenizer and the counter consistency check on a sample
 * paragraph. */
void testTokenizer(void)
{
    const char *strings = "You've reached the website for Arch Linux"
        ", a lightweight and flexible Linux® distribution that tries to Keep It Simple."
        "Currently we have official packages optimized for the i686 and x86-64 architectures."
        "We complement our official package sets with a community-operated package "
        "repository that grows in size and quality each and every day.";

    const char *delim = " ,.";

    strTokenizeAndPrint(strings, delim, 31);
    areNumOfTokensFuncConsistent(strings, delim);
}

#ifndef STRING_TOKENIZER_NO_MAIN
/* Demo: tokenizes a sentence in place and prints the tokens in list order
 * (last token first). */
int main(int argc, char **argv)
{
    char str[] = "oh noes I will get modified by strsep, I'm so scared";
    struct tokenNode *head;
    const struct tokenNode *node;

    (void) argc;
    (void) argv;

    /* Disabled experiment; dirname/basename would need <libgen.h>. */
#if 0
    strTokenizeAndPrint("v 1//2", " /", 21);
    printf("dirname is: \"%s\", basename is: \"%s\"\n", dirname(argv[0]), basename(argv[0]));
#endif

    head = strTokenizeLinkedList(str, " ");

    /* Walk with a second pointer so `head` is still available for freeing. */
    for (node = head; node != NULL && node->tok != NULL; node = node->next)
    {
        printf("token: %s\n", node->tok);
    }

    strTokenizeLinkedListFree(head);

    return 0;
}
#endif