summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKyle K <kylek389@gmail.com>2012-09-18 20:09:55 -0500
committerKyle Kaminski <kyle@kkaminsk.com>2012-09-18 20:09:55 -0500
commite7a8b308273e74279f04ca6635c1d4aa00a842b1 (patch)
treec306225c92f878e92e33060acb40cc59cde2b104
parent7b72a83767b303378068bb1e354755d47b70c108 (diff)
downloadmining-e7a8b308273e74279f04ca6635c1d4aa00a842b1.tar.gz
mining-e7a8b308273e74279f04ca6635c1d4aa00a842b1.tar.bz2
mining-e7a8b308273e74279f04ca6635c1d4aa00a842b1.zip
add a regexp file
-rw-r--r--regexp.txt15
1 files changed, 15 insertions, 0 deletions
diff --git a/regexp.txt b/regexp.txt
new file mode 100644
index 0000000..7cef9cb
--- /dev/null
+++ b/regexp.txt
@@ -0,0 +1,15 @@
+[regexp]
+- difference between ([^c])+ and ([^c]+) heh :p -- in the first the group itself is repeated, so it captures only the last matched character; the latter captures the whole matched string
+
+- grep a cl (craigslist) page
+# grep -E "^\\s*<a\\s+href\\s*=\\s*['\"]+([^'\"])+['\"]\\s*>\\s*([^<])+</a>\\s*$" cl.html
+
+- same but using sed; note how the forward slash in '</a>' got escaped, since / is the sed pattern delimiter
+$ sed -n -r "/^\\s*<a\\s+href\\s*=\\s*['\"]+([^'\"])+['\"]\\s*>\\s*([^<])+<\/a>\\s*$/p" cl.html
+
+- now sed with 2-column output (link, then description); note that the +'s were moved inside the () so that \1 and \2 hold the whole link and description
+$ sed -r -n "s/^\\s*<a\\s+href\\s*=\\s*['\"]+([^'\"]+)['\"]\\s*>\\s*([^<]+)<\/a>\\s*$/\1 \2/p" cl.html
+
+- full cl search
+$ curl -s -i 'http://chicago.craigslist.org/search/pta?query=wrx+|+sti+|+impreza+|+subaru&srchType=T' | sed -r -n "s/^\\s*<a\\s+href\\s*=\\s*['\"]+([^'\"]+)['\"]\\s*>\\s*([^<]+)<\/a>\\s*$/\1 \2/p"
+