#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Rubén Rodríguez
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
#
# This package parses https://addons.mozilla.org and generates a database
# listing the addons that are under a valid license. addons.mozilla.org is
# itself licensed as "CC Attribution Share-Alike v3.0 or any later version."
# Note that screenshots are licensed under the same license as the program
# itself.
#
# To run, install mysql-server, python-mysqldb and python-beautifulsoup.

import re
import sys
import urllib2
import BeautifulSoup

server = "https://addons.mozilla.org"
parsepages = 2  # Number of pages per category to parse
dbuser = "root"
dbpass = ""

# Only addons published under one of these licenses get stored
validlicenses = ['http://www.gnu.org/licenses/gpl-3.0.html',
                 'http://www.gnu.org/licenses/gpl-2.0.html',
                 'http://www.gnu.org/licenses/lgpl-3.0.html',
                 'http://www.gnu.org/licenses/lgpl-2.1.html',
                 'http://www.opensource.org/licenses/bsd-license.php',
                 'http://www.opensource.org/licenses/mit-license.php',
                 'http://www.mozilla.org/MPL/MPL-1.1.html',
                 'http://www.mozilla.org/MPL/2.0/']

categories = ['alerts-updates', 'appearance', 'bookmarks', 'download-management',
              'feeds-news-blogging', 'games-entertainment', 'language-support',
              'photos-music-videos', 'privacy-security', 'search-tools',
              'shopping', 'social-communication', 'tabs', 'web-development',
              'other']


def normalink(string):
    # Drop the query string from a URL
    return re.sub('\?.*', '', string)


def parselist(url):
    # Return the addon page links found on a category listing page
    print "PARSING LIST: " + url
    l = []
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup.BeautifulSoup(response)
    for infodiv in soup.findAll('div', {'class': 'info'}):
        for h3 in infodiv.findAll('h3'):
            for link in h3.findAll('a'):
                l.append(re.sub('\?.*', '', link['href']))
    return l


def parsepage(url, category):
    # Parse one addon page and insert it into the database if its license
    # is in the validlicenses whitelist
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup.BeautifulSoup(response)

    try:
        licenseli = soup.findAll('li', {'class': 'source-license'})[0]
        license = licenseli.findAll('a')[0]['href']
        if license not in validlicenses:
            if license[0] == "h":
                print "INVALID LICENSE: " + license
            return 0
    except:
        return 0

    # Gather the metadata fields that fill the database row
    name = re.sub('/$', '', normalink(url))
    name = re.sub('.*/', '', name)
    prettyname = soup.findAll(attrs={"property": "og:title"})[0]['content']
    description = soup.findAll(attrs={"property": "og:description"})[0]['content']
    rating = soup.findAll(attrs={"itemprop": "ratingValue"})[0]['content']
    popularity = soup.findAll(attrs={"itemprop": "interactionCount"})[0]['content']
    popularity = re.sub('UserDownloads:', '', popularity)
    htmldescription = soup.findAll('div', {'id': 'addon-description'})[0]
    icon = normalink(soup.findAll(attrs={"property": "og:image"})[0]['content'])

    screenshots = []
    try:
        previewdiv = soup.findAll('ul', {'id': 'preview'})[0]
        for a in previewdiv.findAll('a'):
            screenshots.append(normalink(a['href']))
    except:
        pass

    version = soup.findAll('span', {'class': 'version-number'})[0].text
    addondiv = soup.findAll('div', {'id': 'addon'})[0]
    addonp = addondiv.findAll('p', {'class': 'install-button'})[0]
    button = addonp.findAll('a')[0]
    downloadlink = server + normalink(button['href'])

    try:
        homelink = soup.findAll('a', {'class': 'home'})[1]['href']
        homelink = re.sub('.*//', 'http://', homelink)
    except:
        homelink = ""

    try:
        supportlink = soup.findAll('a', {'class': 'support'})[0]['href']
        supportlink = re.sub('.*//', 'http://', supportlink)
    except:
        supportlink = ""

    htmldescription = unicode(htmldescription)
    description = unicode(description)
    prettyname = unicode(prettyname)

    # Escape single quotes so the text fields can be embedded in the INSERT statement
    description = re.sub('\'', '\\\'', description)
    htmldescription = re.sub('\'', '\\\'', htmldescription)
    prettyname = re.sub('\'', '\\\'', prettyname)
    screenshots = unicode(screenshots)

    sql = u'INSERT INTO addons.addons (`name`, `prettyname`, `description`, `htmldescription`, `icon`, `screenshots`, `version`, `rating`, `popularity`, `downloadlink`, `homelink`, `supportlink`, `retrievedlink`, `license`, `category`) VALUES ("' +name+ '", \'' +prettyname+ '\', \'' +description+ '\', \'' +htmldescription+ '\', "' +icon+ '", "' +screenshots+ '", "' +version+ '", "' +rating+ '", "' +popularity+ '", "' +downloadlink+ '", "' +homelink+ '", "' +supportlink+ '", "' +url+ '" , "' +license+ '", "' +category+ '");'

    try:
        cursor = db.cursor()
        cursor.execute(sql)
        cursor.close()
        db.commit()
    except:
        print 'Failed to insert "' + name + '", query: "' + sql + '"'
    else:
        print "Added " + name + " " + url


import MySQLdb

# (Re)create the addons database and its single table
db = MySQLdb.connect(host="localhost", user=dbuser, passwd=dbpass,
                     charset="utf8", use_unicode=True)
cursor = db.cursor()
sql = """DROP DATABASE IF EXISTS addons;
CREATE DATABASE addons CHARACTER SET utf8 COLLATE utf8_general_ci;
USE addons;
CREATE TABLE addons(
    id INT PRIMARY KEY AUTO_INCREMENT,
    name VARCHAR(50),
    prettyname VARCHAR(50),
    description TEXT,
    htmldescription TEXT,
    icon VARCHAR(255),
    screenshots TEXT,
    version VARCHAR(20),
    rating VARCHAR(10),
    popularity INT,
    downloadlink VARCHAR(255),
    homelink VARCHAR(255),
    supportlink VARCHAR(255),
    retrievedlink VARCHAR(255),
    license VARCHAR(255),
    category VARCHAR(20)
);
"""
cursor.execute(sql)
cursor.close()
db.commit()

# Crawl the first `parsepages` listing pages of every category and store
# each addon that passes the license check
for category in categories:
    links = []
    for page in range(1, 1 + parsepages):
        links = links + parselist(server + "/en-US/firefox/extensions/" + category + "/?sort=popular&page=" + str(page))
    for link in links:
        parsepage(server + link, category)

#tests
#parsepage("https://addons.mozilla.org/en-US/firefox/addon/twoo/", "test")
#parsepage("https://addons.mozilla.org/en-US/firefox/addon/what-about/", "test")
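
# Illustrative follow-up query (a sketch for illustration, not part of the
# original crawl): after the run finishes, the generated table can be inspected
# through the same MySQLdb connection, e.g. to list the ten most-downloaded
# addons that made it past the license whitelist.
#cursor = db.cursor()
#cursor.execute("SELECT prettyname, license, popularity FROM addons.addons "
#               "ORDER BY popularity DESC LIMIT 10")
#for row in cursor.fetchall():
#    print row
#cursor.close()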