"""
This is a Python script written using the pywikipediabot framework. You can find out more about pywikipedia at http://meta.wikimedia.org/wiki/pywikipediabot
This script automatically creates articles after querying ITIS for information.
Requires Python 2 (urllib2) and the old pywikipedia "compat" framework.
Released under Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License
(C) Monobi 2008
"""
import re
import urllib
import urllib2

import wikipedia
def getsource(pagereq):
    """Fetch a URL and return the raw HTML, spoofing a browser user agent."""
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    header = {'User-Agent': user_agent}
    req = urllib2.Request(pagereq, None, header)
    response = urllib2.urlopen(req)
    page = response.read()
    return page
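# A minimal usage sketch for getsource() (the species name in the query is
# illustrative only, not a real lookup):
#
#   html = getsource("http://www.itis.gov/servlet/SingleRpt/SingleRpt"
#                    "?search_topic=all&search_value=Puma+concolor")
#   # 'html' now holds the raw ITIS report page as a byte string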
def main():
    ###########################################
    # Write switch: 1 = save pages, 0 = off   #
    ###########################################
    on = 0
    ############################################
    # Various regexes for matching needed data #
    ############################################
    founder_species = re.compile(r" <B>(?P<author>.*?)</B>", re.I)
    vn_es_template = re.compile(r'width="(.*?)\W">(?P<spanish>.*?)\[Spanish\]', re.I)
    vn_en_template = re.compile(r'width="(.*?)\W">(?P<english>.*?)\[English\]', re.I)
    genus_template = re.compile(r'Genus</TD><TD class=datafield vAlign=top width="71%"><A HREF="SingleRpt\?search_topic=TSN&search_value=.*">(?P<template>.*?)</A>', re.I)
    subspecies_template = re.compile(r'Subspecies</TD><TD class=datafield vAlign=top width="71%"><A HREF="SingleRpt\?search_topic=TSN&search_value=(.*?)">(?P<subspeciesname>.*?)</A>', re.I)
    serial_number = re.compile(r"Taxonomic Serial No\.: (?P<ID>.*?)</SPAN>", re.I)
    no_records = re.compile(r"No Records Found", re.I)
    article_match = re.compile(r"\[\[(.*?)\]\]", re.I)
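    # These patterns screen-scrape the ITIS "single report" HTML. An
    # illustrative, made-up fragment of the markup they target (not a
    # verbatim ITIS response):
    #
    #   <SPAN class="...">Taxonomic Serial No.: 180542</SPAN>
    #   Genus</TD><TD class=datafield vAlign=top width="71%"><A HREF="SingleRpt?search_topic=TSN&search_value=180541">Puma</A>
    #   <TD ... width="46%">mountain lion [English]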
    ##############################################################
    # Where the bot downloads the list of articles to be created #
    ##############################################################
    pagelist = "speciespages.txt"
    ######################
    # Site (wikispecies) #
    ######################
    site = wikipedia.getSite('species', 'species')
    ###################################
    # Location of pages to be created #
    ###################################
    to = wikipedia.Page(site, 'User:MonoBot/Requested Articles')
    totext = to.get()
    listfile = open(pagelist, 'w')
    listfile.write(totext.encode('utf-8'))
    listfile.close()
    for line in open(pagelist, 'r'):
        prelim_article = article_match.findall(line)
        for article_name in prelim_article:
            try:
                ###################################################################
                # Spanish and English {{VN}} template, None (no match) by default #
                ###################################################################
                spanish_vn = None
                english_vn = None
                founder = None
                ##############################
                # Send query to ITIS website #
                ##############################
                article_name = urllib.quote_plus(article_name)
                data_page = getsource("http://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=all&search_kingdom=every&search_span=exactly_for&search_value=%s&categories=All&source=html&search_credRating=All&Go=Search" % article_name)
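                # For reference, the fixed parameters in the query string ask
                # ITIS for an exact-name match across every kingdom, with any
                # credibility rating, returned as HTML (a reading of the
                # parameter names, not documented behaviour)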
                ##########################
                # Get the genus template #
                ##########################
                if not no_records.search(data_page):
                    template_getter = genus_template.search(data_page)
                    if template_getter:
                        my_template = template_getter.group('template')
                    ##########################
                    # Get the ITIS ID number #
                    ##########################
                    id_getter = serial_number.search(data_page)
                    if id_getter:
                        my_id = id_getter.group('ID')
                    # Without a genus template and a TSN no page can be
                    # built, so skip names ITIS couldn't fully resolve
                    if not template_getter or not id_getter:
                        continue
                    ############################
                    # Working on subspecies... #
                    ############################
                    # Collected for possible later use; not yet written out
                    subspecies_list = subspecies_template.search(data_page)
                    if subspecies_list:
                        subspecies_individual = subspecies_list.group('subspeciesname')
                    ################################
                    # Get the |es= part for {{VN}} #
                    ################################
                    get_spanish_vn = vn_es_template.search(data_page)
                    if get_spanish_vn:
                        spanish_vn = get_spanish_vn.group('spanish')
                        spanish_vn_list = spanish_vn.split()
                        if spanish_vn_list:
                            # Capitalise the first word of the vernacular name
                            spanish_vn = (spanish_vn_list[0].title() + " " +
                                          " ".join(spanish_vn_list[1:])).strip()
                    ################################
                    # Get the |en= part for {{VN}} #
                    ################################
                    get_english_vn = vn_en_template.search(data_page)
                    if get_english_vn:
                        english_vn = get_english_vn.group('english')
                        english_vn_list = english_vn.split()
                        if english_vn_list:
                            # Capitalise the first word of the vernacular name
                            english_vn = (english_vn_list[0].title() + " " +
                                          " ".join(english_vn_list[1:])).strip()
                    ############################
                    # Get the authors and date #
                    ############################
                    get_founder_search = founder_species.search(data_page)
                    if get_founder_search:
                        founder = get_founder_search.group('author')
                        # Wikilink each word of the author citation, then
                        # clean up connectives, punctuation and the year
                        final = ''
                        for word in founder.split():
                            final += u" [[%s]]" % word
                        final = final.replace("[[and]]", "&")
                        final = final.replace("[[in]]", "in ")
                        final = final.replace("(", "")
                        final = final.replace(")", "")
                        final = final.replace(",", "")
                        # Years should be plain text, not links
                        final = re.sub(r"\[\[(\d{4})\]\]", r"\1", final)
                        final = final.replace(" [[", "[[")
                        final = final.replace("&[[", "& [[")
                        final = final.replace("[[van]]", "van")
                        # Re-insert the comma between the last author and the year
                        final = re.sub(r'\]\] (\d{4})', r']], \1', final)
                        final = final.replace("]][[", "]] [[")
                        founder = final
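                        # Worked example of the chain above (illustrative):
                        # " <B>(Linnaeus, 1758)</B>" captures "(Linnaeus, 1758)",
                        # is wikilinked word by word to " [[(Linnaeus,]] [[1758)]]",
                        # and cleaned step by step down to "[[Linnaeus]], 1758"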
                    #wikipedia.output(founder)
                    ####################################################
                    # Fall back to empty strings where nothing matched #
                    ####################################################
                    if spanish_vn is None:
                        spanish_vn = ""
                    if english_vn is None:
                        english_vn = ""
                    if founder is None:
                        founder = ""
                    ##################################################
                    # Turn the article_name string back into a page  #
                    ##################################################
                    article_name = urllib.unquote_plus(article_name)
                    article_name = wikipedia.Page(site, article_name)
                    ##############################################
                    # Define all of the information to be placed #
                    ##############################################
                    taxonav = "==Taxonavigation==\n{{%s}}\nSpecies: ''[[%s]]''" % (my_template, article_name.titleWithoutNamespace())
                    name = "\n\n==Name==\n''{{subst:PAGENAME}}'' (%s)\n\n" % founder
                    references = "==References==\n*{{ITIS|%s}}\n\n" % my_id
                    vern_names = "==Vernacular names==\n{{VN\n|en=%s\n|es=%s}}" % (english_vn, spanish_vn)
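                    # A generated page then looks roughly like this (all
                    # values below are illustrative placeholders):
                    #
                    #   ==Taxonavigation==
                    #   {{Puma}}
                    #   Species: ''[[Puma concolor]]''
                    #
                    #   ==Name==
                    #   ''Puma concolor'' ([[Linnaeus]], 1771)
                    #
                    #   ==References==
                    #   *{{ITIS|552479}}
                    #
                    #   ==Vernacular names==
                    #   {{VN
                    #   |en=Mountain lion
                    #   |es=Puma}}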
                    ################################################################
                    # If the article doesn't exist, a vernacular name was found,   #
                    # and the bot is switched on                                   #
                    ################################################################
                    if not article_name.exists() and (english_vn != "" or spanish_vn != "") and on == 1:
                        newtext = taxonav + name + references + vern_names
                        wikipedia.output(newtext)
                        article_name.put_async(newtext, 'Automatically creating species page', True)
                    ################################################################
                    # If the article doesn't exist, no author was found,           #
                    # and the bot is switched on                                   #
                    ################################################################
                    elif not article_name.exists() and founder == "" and on == 1:
                        newtext2 = taxonav + references
                        wikipedia.output(newtext2)
                        article_name.put_async(newtext2, 'Automatically creating species page', True)
                    elif not article_name.exists() and founder != "" and on == 1:
                        newtext3 = taxonav + name + references
                        article_name.put_async(newtext3, 'Automatically creating species page', True)
                    # Otherwise the page already exists or the bot is switched off
            except wikipedia.Error:
                wikipedia.output(u"Error on %s" % article_name)
                continue
            except UnicodeDecodeError:
                wikipedia.output(u"Error with unicode, skipping...")
                continue
    ###################################################
    # Same page as the list of articles to be created #
    # This cleans up the created articles             #
    ###################################################
    clean_page = wikipedia.Page(site, 'User:MonoBot/Requested Articles')
    try:
        clean_text = clean_page.get()
        cleaner_text = ":'''Admins''': ''Please look at the [<!-- -->[User:MonoBot/Instruction]<!-- -->] page before adding any pages here''\n"
        clean_page.put(cleaner_text, 'Cleaning list of processed articles', True)
    except wikipedia.LockedPage:
        wikipedia.output(u"Can't save page, don't have permission")
    except wikipedia.Error:
        wikipedia.output(u"Error while cleaning the request page")
if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()