path: root/AddonsScraper.py
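# AddonsScraper.py -- scrape addons.mozilla.org search results for Linux
# add-ons, keep only those released under a whitelisted open-source license,
# and print each add-on's title and install link.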
import re
import urllib2
import BeautifulSoup

server="https://addons.mozilla.org"
validlicenses=['http://www.gnu.org/licenses/gpl-3.0.html',
'http://www.mozilla.org/MPL/MPL-1.1.html',
'http://www.opensource.org/licenses/bsd-license.php',
'http://www.gnu.org/licenses/gpl-2.0.html',
'http://www.opensource.org/licenses/mit-license.php',
'http://www.mozilla.org/MPL/2.0/']

def normalink(string):
    """Strip the query string (everything from '?' on) from a link."""
    return re.sub(r'\?.*', '', string)

def parselist(url):
    """Fetch a search-results page and return the add-on detail-page links."""
    l = []
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup.BeautifulSoup(response)
    # Each result is a div.info whose h3 heading links to the add-on's page.
    for infodiv in soup.findAll('div', {'class': 'info'}):
        for h3 in infodiv.findAll('h3'):
            for link in h3.findAll('a'):
                l.append(normalink(link['href']))
    return l


def parsepage(url):
    """Fetch an add-on page; if its license is whitelisted, print the
    add-on's title and install link, otherwise return 0."""
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup.BeautifulSoup(response)
    try:
        licenseli = soup.findAll('li', {'class': 'source-license'})[0]
        license = licenseli.findAll('a')[0]['href']
        if license not in validlicenses:
            print "INVALID LICENSE: " + license
            return 0
    except (IndexError, KeyError):
        # No license information on the page; skip this add-on.
        return 0
    # Title taken from the Open Graph metadata.
    print soup.findAll(attrs={"property": "og:title"})[0]['content']
    # Description, icon and license could be printed the same way:
    # print soup.findAll(attrs={"property": "og:description"})[0]['content']
    # print normalink(soup.findAll(attrs={"property": "og:image"})[0]['content'])
    # print license
    # The install button's href points at the add-on download.
    addondiv = soup.findAll('div', {'id': 'addon'})[0]
    addonp = addondiv.findAll('p', {'class': 'install-button'})[0]
    button = addonp.findAll('a')[0]
    print server + normalink(button['href'])

links = []
# Search for Linux add-ons; only the first results page is fetched here.
for page in range(1, 2):
    links = links + parselist(server + "/en-US/firefox/search/?q=+&platform=linux&page=" + str(page))

# Print title and install link for every add-on with an accepted license.
for link in links:
    parsepage(server + link)
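# Run with Python 2 (e.g. "python2 AddonsScraper.py"); requires the
# BeautifulSoup 3 package, which is imported as "BeautifulSoup" above.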