__author__ = 'raffaelherrmann'

import re, csv
from lxml.html import parse

def reg_match(str, exp):
    """Return the first ``re.findall`` hit of *exp* in *str*, or ``0``.

    Args:
        str: Text to search (the name shadows the builtin but is kept so
            existing keyword callers continue to work).
        exp: Regular expression pattern.

    Returns:
        The first element of the ``findall`` result: the captured group when
        the pattern has exactly one group, a tuple of groups when it has
        several, or the whole match when it has none. The integer ``0`` is
        returned when nothing matches.
    """
    matches = re.compile(exp).findall(str)
    return matches[0] if matches else 0

def parse_dom(url):
    """Scrape one listing page and return one row per product.

    Args:
        url: Location of the listing page; handed unchanged to
            ``lxml.html.parse``.

    Returns:
        A list of ``(name, price, ratings, sales)`` tuples, one for each
        ``.product-list__columns-container`` element on the page. ``price``
        is the price text with leading/trailing spaces and ``$`` characters
        stripped. ``ratings`` and ``sales`` are the digit strings found
        directly before " ratings" / " Sales" in the info text, or the
        integer ``0`` when the text contains no such figure.

    Raises:
        IndexError: If a product container has no heading link, price
            element or info element.
    """
    result = []
    doc = parse(url).getroot()
    for plugin in doc.cssselect('.product-list__columns-container'):
        name = plugin.cssselect('.product-list__heading a')[0].text_content()
        price = plugin.cssselect('.product-list__price-desktop')[0].text_content().strip(' $')
        # Both counters live in the same info element, so read its text once.
        info = plugin.cssselect('.product-list__info-desktop')[0].text_content()
        # Raw strings keep ``\d`` from being treated as an (invalid) string
        # escape. ``\d+`` captures a single contiguous run of digits only.
        # NOTE(review): a figure rendered with thousands separators would be
        # truncated to its last digit group -- confirm the page's format.
        ratings = reg_match(info, r'(\d+) ratings')
        sales = reg_match(info, r'(\d+) Sales')
        result.append((name, price, ratings, sales))
    return result



# Listing URL template ("{}" receives the page number) and output file name.
# The commented-out pair is the alternative CodeCanyon plugin configuration;
# swap it with the active pair to write plugin statistics instead.
#start_url = 'http://codecanyon.net/category/wordpress?date=&page={}&price_max=&price_min=&rating_min=&sales=&sort=sales&term=&view=list'
#fname = 'stats_plugins.csv'
start_url = 'http://themeforest.net/category/wordpress?date=&page={}&price_max=&price_min=&rating_min=&sales=&sort=sales&term=&view=list'
fname = 'stats_themes.csv'
datasets = []

# Crawl listing pages 1 through 60 and collect every product row.
for page_num in range(1, 61):
    print('Crawling page {} ...'.format(page_num))
    datasets.extend(parse_dom(start_url.format(page_num)))

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with blank lines between rows.
# NOTE(review): the file uses the platform default encoding; product names
# outside that encoding would fail to write -- consider an explicit encoding.
with open(fname, 'w', newline='') as out:
    csv_out = csv.writer(out, delimiter=";")
    csv_out.writerow(['name', 'price', 'ratings', 'sales'])
    csv_out.writerows(datasets)

print("===============\nReady!")