diff options
author | Ruben Rodriguez <ruben@gnu.org> | 2014-10-21 01:10:26 +0200 |
---|---|---|
committer | Ruben Rodriguez <ruben@gnu.org> | 2014-10-21 01:10:26 +0200 |
commit | c47f3dda10695dc1e71851e8cf7fdfb99d9d1c66 (patch) | |
tree | 2a44d64548885f28939d4f2fc17f9759849c3b57 /tools/AddonsScraper.py | |
parent | 14e6d0ed4be07a4d6bf94a141af83b7d60f1ac5f (diff) |
Added tools dir
Diffstat (limited to 'tools/AddonsScraper.py')
-rw-r--r-- | tools/AddonsScraper.py | 169 |
1 file changed, 169 insertions(+), 0 deletions(-)
diff --git a/tools/AddonsScraper.py b/tools/AddonsScraper.py new file mode 100644 index 0000000..3813252 --- /dev/null +++ b/tools/AddonsScraper.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2014 Rubén Rodríguez <ruben@gnu.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# +# This package parses https://addons.mozilla.org and generates a database +# listing the addons that are under a valid license. addons.mozilla.org is +# itself licensed as "CC Attribution Share-Alike v3.0 or any later version." +# Note that screenshots get licensed under the same license as the program. 
+# +# To run, install mysql-server, python-mysqldb and python-beautifulsoup +# + + +import re +import sys +import urllib2 +import BeautifulSoup + +server="https://addons.mozilla.org" +parsepages=2 #Number of pages per category to parse +dbuser="root" +dbpass="" + +validlicenses = ['http://www.gnu.org/licenses/gpl-3.0.html', +'http://www.gnu.org/licenses/gpl-2.0.html', +'http://www.gnu.org/licenses/lgpl-3.0.html', +'http://www.gnu.org/licenses/lgpl-2.1.html', +'http://www.opensource.org/licenses/bsd-license.php', +'http://www.opensource.org/licenses/mit-license.php', +'http://www.mozilla.org/MPL/MPL-1.1.html', +'http://www.mozilla.org/MPL/2.0/'] + +categories = ['alerts-updates', 'appearance', 'bookmarks', 'download-management', 'feeds-news-blogging', 'games-entertainment', 'language-support', 'photos-music-videos', 'privacy-security', 'search-tools', 'shopping', 'social-communication', 'tabs', 'web-development', 'other'] + +def normalink(string): + return re.sub('\?.*', '', string) + +def parselist(url): + print "PARSING LIST: " + url + l = [] + request = urllib2.Request(url) + response = urllib2.urlopen(request) + soup = BeautifulSoup.BeautifulSoup(response) + for infodiv in soup.findAll('div',{'class':'info'}): + for h3 in infodiv.findAll('h3'): + for link in h3.findAll('a'): + l.append(re.sub('\?.*', '', link['href'])) + return l + + +def parsepage(url, category): + request = urllib2.Request(url) + response = urllib2.urlopen(request) + soup = BeautifulSoup.BeautifulSoup(response) + try: + licenseli = soup.findAll('li',{'class':'source-license'})[0] + license = licenseli.findAll('a')[0]['href'] + if license not in validlicenses: + if license[0] == "h": + print "INVALID LICENSE: " + license + return 0 + except: + return 0 + name = re.sub('/$','', normalink(url)) + name = re.sub('.*/','', name) + prettyname = soup.findAll(attrs={"property":"og:title"})[0]['content'] + description = soup.findAll(attrs={"property":"og:description"})[0]['content'] + rating = 
soup.findAll(attrs={"itemprop":"ratingValue"})[0]['content'] + popularity = soup.findAll(attrs={"itemprop":"interactionCount"})[0]['content'] + popularity = re.sub('UserDownloads:', '', popularity) + htmldescription = soup.findAll('div',{'id':'addon-description'})[0] + icon = normalink(soup.findAll(attrs={"property":"og:image"})[0]['content']) + screenshots = [] + try: + previewdiv = soup.findAll('ul',{'id':'preview'})[0] + for a in previewdiv.findAll('a'): + screenshots.append(normalink(a['href'])) + except: + pass + version = soup.findAll('span',{'class':'version-number'})[0].text + addondiv = soup.findAll('div',{'id':'addon'})[0] + addonp = addondiv.findAll('p',{'class':'install-button'})[0] + button = addonp.findAll('a')[0] + downloadlink = server + normalink(button['href']) + try: + homelink = soup.findAll('a',{'class':'home'})[1]['href'] + homelink = re.sub('.*//','http://',homelink) + except: + homelink = "" + try: + supportlink = soup.findAll('a',{'class':'support'})[0]['href'] + supportlink = re.sub('.*//','http://',supportlink) + except: + supportlink = "" + + htmldescription = unicode(htmldescription) + description = unicode(description) + prettyname = unicode(prettyname) + description = re.sub('\'', '\\\'', description) + htmldescription = re.sub('\'', '\\\'', htmldescription) + prettyname = re.sub('\'', '\\\'', prettyname) + screenshots = unicode(screenshots) + + sql = u'INSERT INTO addons.addons (`name`, `prettyname`, `description`, `htmldescription`, `icon`, `screenshots`, `version`, `rating`, `popularity`, `downloadlink`, `homelink`, `supportlink`, `retrievedlink`, `license`, `category`) VALUES ("' +name+ '", \'' +prettyname+ '\', \'' +description+ '\', \'' +htmldescription+ '\', "' +icon+ '", "' +screenshots+ '", "' +version+ '", "' +rating+ '", "' +popularity+ '", "' +downloadlink+ '", "' +homelink+ '", "' +supportlink+ '", "' +url+ '" , "' +license+ '", "' +category+ '");' + try: + cursor = db.cursor() + cursor.execute(sql) + cursor.close() + 
db.commit() + except: + print 'Failed to insert "' + name + '", query: "' + sql + else: + print "Added " + name + " " + url + +import MySQLdb +db = MySQLdb.connect(host="localhost",user=dbuser,passwd=dbpass, charset="utf8", use_unicode=True) +cursor = db.cursor() +sql = """DROP DATABASE IF EXISTS addons; +CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci; +USE addons; +CREATE TABLE addons( +id INT PRIMARY KEY AUTO_INCREMENT, +name VARCHAR(50), +prettyname VARCHAR(50), +description TEXT, +htmldescription TEXT, +icon VARCHAR(255), +screenshots TEXT, +version VARCHAR(20), +rating VARCHAR(10), +popularity INT, +downloadlink VARCHAR(255), +homelink VARCHAR(255), +supportlink VARCHAR(255), +retrievedlink VARCHAR(255), +license VARCHAR(255), +category VARCHAR(20) +); +""" +cursor.execute(sql) +cursor.close() +db.commit() + +for category in categories: + links=[] + for page in range(1,1+parsepages): + links = links + parselist(server + "/en-US/firefox/extensions/" + category + "/?sort=popular&page="+str(page)) + for link in links: + parsepage(server+link, category) + +#tests +#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test") +#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test") |