diff options
author | Kyle K <kylek389@gmail.com> | 2017-03-26 19:55:06 -0500 |
---|---|---|
committer | Kyle K <kylek389@gmail.com> | 2017-03-26 19:55:06 -0500 |
commit | 007a3ca9ea7c23f14341da104de47ecc919dfa77 (patch) | |
tree | c71dbc4ec3acb12ba16e5aeb88b296730497b16d | |
parent | 824bcb51245ae4b693d984c63f58506389371a5e (diff) | |
download | PythonPractice-007a3ca9ea7c23f14341da104de47ecc919dfa77.tar.gz PythonPractice-007a3ca9ea7c23f14341da104de47ecc919dfa77.tar.bz2 PythonPractice-007a3ca9ea7c23f14341da104de47ecc919dfa77.zip |
cl.py is a module that will scrape craigslist and display first 120 WRX STi car parts for sale
-rw-r--r-- | cl.py | 49 |
1 files changed, 49 insertions, 0 deletions
@@ -0,0 +1,49 @@ +#!/usr/bin/env python + +import argparse +import sys +import requests +from bs4 import BeautifulSoup + + +def query_craigslist(baseurl=None, keyword='wrx|sti'): + if baseurl is None: + baseurl = 'https://chicago.craigslist.org/' + response = requests.get(baseurl + 'search/pta', params={'query': keyword, 'srchType': 'T'}) + soup = BeautifulSoup(response.content, "html.parser") + + results = soup.find_all('li', {'class': 'result-row'}) # at max 120 results per 1 page + items = [] + + for i in results: + try: + title = i.find('a', {'class': 'result-title hdrlnk'}).get_text() + link = i.find('a', {'class': 'result-title hdrlnk'}).get('href') + date = i.find('time', {'class': 'result-date'}).get('datetime') + price = i.find('span', {'class': 'result-price'}).get_text() + hood = i.find('span', {'class': 'result-hood'}).get_text() + imgs = i.find('a', {'class': 'result-image gallery'}).get('data-ids').split(',') + img_1 = 'https://images.craigslist.org/' + imgs[0][2:] + '_300x300.jpg' + d = {'title': title, 'link': link, 'date': date, 'price': price, 'hood': hood, 'img': img_1} + items.append(d) + except AttributeError: + pass # ignore empty fields + + return items + + +def main(): + parser = argparse.ArgumentParser(description="craigslist WRX and STi parts finder", parents=()) + parser.add_argument("-b", "--baseurl", help='baseurl, e.g. https://chicago.craigslist.org/') + parser.add_argument("-k", "--keyword", default='wrx|sti', help='keyword to search') + + args, extra_args = parser.parse_known_args() + partlist = query_craigslist(args.baseurl, args.keyword) + print(partlist) + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + pass |