User:Neoconned/LocalRefConverter

This is a crudely hacked version of http://en.wikipedia.org/wiki/User:Cyde/Ref_converter. It converts an article that uses the note/ref template referencing system so that it uses the newer Cite.php references instead. Cyde Weys' original version is designed to run on a web server and to fetch articles directly from Wikipedia. This version runs on your own PC: it reads the article to convert from a local text file and saves the converted article to another local text file. To use:


 * Install perl on your computer.
 * Unlike with the original, you don't need to install any Perl extensions from CPAN.
 * Save the code below into a file called wikirefs.txt in the directory you will run Perl from (the script reads and writes its files relative to the current working directory).
 * Put the wiki source of the article you want to convert into a file called convert_me.txt in that same directory.
 * From that directory, run: perl wikirefs.txt
 * The converted article is written to a file called convertedFile.txt in that same directory.

#!/usr/bin/perl

# "WikiRefs"
# This program converts {{ref}} and {{note}} to <ref> style on Wikipedia.
# Copyright (C) 2006 Ben "Cyde Weys" McIlwain
# Trivially modified by Neoconned (SourceWatch) to run locally, May 2007
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

####################################################################
#         Script configuration options.
####################################################################

# Set to 1 to enable logging. Set to 0 to disable logging.
# NOTE(review): nothing else in this local version reads this flag.
my $optLogging = 1;

# Specify the location to store the log (must be writable by your httpd process).
# NOTE(review): nothing else in this local version reads this path.
my $optLogLoc = '/var/log/apache2/refconv.log';

# Forward declaration; the definition of writeToLog is at the end of the script.
sub writeToLog;

# This accumulates the number of possible things that were incorrect with
# the {{ref}}/{{note}} templates.
my $numErrors = 0;

print 'Getting Wiki source...';

# The article to convert; a relative path, so it is resolved against the
# current working directory.
my $data_file = "convert_me.txt";

# Slurp the whole article in a single read. Undefining $/ (the input record
# separator) inside the do-block makes the readline return the entire file,
# and "local" restores the normal line-at-a-time behaviour afterwards.
my $responseContent = do {
    local $/;
    open(my $slurp_fh, '<', $data_file)
        or die("Could not open file! ($data_file: $!)\n");
    my $content = <$slurp_fh>;
    close($slurp_fh);
    $content;
};

my $fullText = $responseContent;

print "\n\n";
print "OPENED FILE OK \n\n";



# This keeps track of the initial length of the article before we make any
# changes to it.
my $preLength = length($fullText);

# These two variables accumulate lines of text describing what the converter
# did: $warnings collects things the user should double-check, $verbosage
# collects a blow-by-blow record of each ref/note match.
my $warnings  = "";
my $verbosage = "";

# Get rid of the "How to add a footnote" comment that this script makes
# superfluous. The pattern targets an HTML comment whose opening line
# mentions "add" and then "footnote"; /s lets ".*?" run across newlines to
# the closing "-->". s/// returns true only when it actually removed
# something, so the warning is added exactly when the text changed.
if ($fullText =~ s/\<\!\-\-[^\n]*add[^\n]*footnote.*?\-\-\>//s) {
    $warnings .= "Deleting comment on how to add old footnotes, make sure this was done correctly.\n";
}

# {{mnb2}} is incredibly broken, so just warn when it is present.
# No /g here: in scalar context it adds nothing to a yes/no test and only
# leaves pos() set on $fullText, which can make a later //g match on the
# same string start part-way through.
if ($fullText =~ m/\{\{mnb2/i) {
    $warnings .= "Panic, detecting {{mnb2}}, this article is most likely broken and will need manual repair.\n";
}

# This goes through the article source looking for citation templates that
# are over one line. This is necessary because the citation templates must be
# inserted into the article text inline or things will break.
# This has the side-effect of changing citation templates that aren't part of
# notes. Oh well.
# Then we need to detect if any changes have been made, and if they have,
# print a warning message to that effect.
my $tempText = $fullText;

# /e evaluates the replacement as code: copy the matched {{cite ...}}
# template, delete every newline inside it, and substitute the result.
$fullText =~ s/(\{\{cite [^\{\}]*?\}\})/my $x = $1; $x =~ s{\n}{}g; $x/egs;

if ($tempText ne $fullText) {
    $warnings .= "Detecting multiple line cite, trying to fix, make sure I don't make any mistakes.\n";
}

# Get a list of the names (first template argument) used by every {{ref}},
# {{ref label}}, {{ref harv}}, {{ref harvard}}, {{ref num}} and {{mn}}
# template in the article. In list context m//g returns the captured group
# of each match, so @matches may contain the same name several times.

my @matches = ($fullText =~ m/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*([^\|]*?)\s*(?:\|\s*[^\|\}]*?\s*)*?\}\}/gi); #push @matches, ($fullText =~ m/\{\{mn\s*\|\s*([^\|]*?)\s*\|\s*[^\|\}]*?\s*\}\}/gi);

# Main conversion. Pairs every {{ref}}-style template with the {{note}} line
# that carries its text, rewrites the article to Cite.php <ref> markup, and
# writes the result to convertedFile.txt.
#
# NOTE(review): the published copy of this script had its angle-bracket
# markup (<ref ...>, <references/>, <!-- -->) and part of the
# singly-referenced loop stripped out. Those pieces are reconstructed here
# from the surrounding comments, messages and the parallel
# multiply-referenced loop; compare against Cyde Weys' original before
# relying on exact output.

# If there are no {{ref}}s in the article then there's no point in continuing.
if (@matches) {

    ###
    # This next little section creates @matchesSingle, which consists of
    # @matches minus any duplicate entries, and @matchesMult, which consists
    # of a list of single entries of things that did have duplicate entries.
    # It also removes duplicate entries from @matches.
    # In other words, if @matches was [a,a,b,c,d,d,e], then:
    #   @matches       = [a,b,c,d,e]
    #   @matchesSingle = [b,c,e]
    #   @matchesMult   = [a,d]
    my %tempHash;
    my %multHash;
    foreach my $name (@matches) {
        # lc() folds the name to lowercase so "Foo" and "foo" count as one ref.
        my $key = lc($name);
        if (exists $tempHash{$key}) {
            $multHash{$key} = $key;
        }
        else {
            $tempHash{$key} = $key;
        }
    }
    @matches = sort values %tempHash;
    my @matchesMult = sort values %multHash;

    # Subtract set @matchesMult from set @matchesSingle.
    delete @tempHash{@matchesMult};
    my @matchesSingle = sort values %tempHash;
    # End complicated section.
    ###

    if (@matchesMult) {
        $warnings .= "Detecting multiple refs with the same name, make sure I handle this correctly.\n";
    }

    # refCorrs is the hash between ref name and note text.
    my %refCorrs   = ();
    my $finalText  = "";
    my $firstMatch = 1;
    my $matched    = 0;

    # $smallFont is never assigned anywhere in this local version, so the
    # small-font branch below is effectively off. "our" only names the
    # package variable; it does not change whatever value it may hold.
    our $smallFont;

    # Builds the pattern for any {{mn}} / {{ref}}-family template whose first
    # argument is $name (case-insensitive, name quoted with \Q...\E).
    my $refTemplateFor = sub {
        my ($name) = @_;
        return qr/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*\Q$name\E\s*(?:\|[^\|\}]*?\s*)*?\}\}/i;
    };

    # Split the full Wiki source into discrete lines and process them
    # sequentially to see if each line contains a {{note}} or a {{mnb}}.
    # If the line does contain a {{note}}, match it up in the hash with its
    # appropriate ref. If it doesn't match, throw a warning and comment it
    # out. If it did match, remove it, and replace all removed {{note}}s with
    # a single <references/>.
    foreach my $thisLine (split /\n/, $fullText) {
        $matched = 0;

        # Loop through each of the ref names to see if it matches with any
        # notes on this line. This has O(n*m) efficiency.
        foreach my $name (@matches) {
            if ($thisLine =~ m/\{\{(?:mnb2?|note(?:[_ ]label)?)\s*\|\s*\Q$name\E\s*(?:\|\s*[^\{\}]*?\s*)*\}\}\s*(.*)$/i) {
                my $thisMatch = $1;

                # Drop any further {{note label}} templates left in the note text.
                $thisMatch =~ s/\{\{note[_ ]label\s*[^\}\{]*?\}\}//gi;

                # Chop off leading and trailing spaces.
                $thisMatch =~ s/^\s+//;
                $thisMatch =~ s/\s+$//;

                $verbosage .= "Matching up ref \"$name\", removing from list, note is: $thisMatch\n";
                $refCorrs{$name} = $thisMatch;
                $matched = 1;

                # firstMatch is used to keep track of the first note that has
                # been replaced. The first note is replaced with <references/>
                # and the rest are just deleted. Nothing is inserted when the
                # article already contains a <references/> tag.
                if ($firstMatch == 1) {
                    if ($fullText !~ m/\<references\s*\/\>/i) {
                        if (defined $smallFont && $smallFont eq "on") {
                            # NOTE(review): wrapper markup reconstructed - confirm.
                            $finalText .= '<div class="references-small"><references/></div>' . "\n";
                        }
                        else {
                            $finalText .= "<references/>\n";
                        }
                    }
                    $firstMatch = 0;
                }
            }
        }

        # If this line had a note with no corresponding ref, comment it out
        # and print a warning message. Lines without any note pass through.
        if ($matched == 0) {
            if ($thisLine =~ m/\{\{(?:mnb2?|note)\s*\|\s*([^\|]*?)\s*\|?\s*\}\}\s*(.*)$/i) {
                $warnings .= "Note \"$1\" isn\'t referenced, commenting out, link was: $2\n";
                $numErrors++;
                # NOTE(review): exact comment spacing reconstructed - confirm.
                $finalText .= "<!-- $thisLine -->\n";
            }
            else {
                $finalText .= $thisLine . "\n";
            }
        }
    }

    # Go through and replace references that were only referenced once with a
    # simple <ref>note text</ref>.
    foreach my $currMatch (@matchesSingle) {
        my $refPattern = $refTemplateFor->($currMatch);

        if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
            my $note = $refCorrs{$currMatch};
            $finalText =~ s/$refPattern/<ref>$note<\/ref>/g;
            $verbosage .= "Replacing singly referenced \"$currMatch\" with full notes: <ref>$note</ref>\n";
        }
        elsif (exists $refCorrs{$currMatch}) {
            # Deal with blank notes. We don't want to be inserting <ref></ref>
            # into the article. (Reaching here means the note exists but its
            # text is blank; the original tested the ref *name* for blankness,
            # which sent blank notes to the "doesn't exist" branch instead.)
            $numErrors++;
            $warnings .= "Found a blank note, ref is \"$currMatch\"\n";
        }
        else {
            $numErrors++;
            $warnings .= "Ref \"$currMatch\" doesn\'t exist in notes. Turning into \{\{citation needed\}\}\n";
        }
    }

    # Now we need to go through and replace references that were referenced
    # multiple times. We need to name our references now.
    foreach my $currMatch (@matchesMult) {
        my $refPattern = $refTemplateFor->($currMatch);

        if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
            # Cite.php returns an error if the refName is an integer value, so
            # we'll pad it out with a character.
            my $refName = $currMatch;
            if ($refName =~ m/^\d+$/) {
                $refName = 'ref' . $refName;
            }
            my $note = $refCorrs{$currMatch};

            # The first occurrence carries the full note text...
            $finalText =~ s/$refPattern/<ref name="$refName">$note<\/ref>/;
            # ...and every remaining occurrence becomes a named back-reference.
            $finalText =~ s/$refPattern/<ref name="$refName"\/>/g;

            $verbosage .= "Replacing multiply referenced \"$refName\" with full notes: <ref name=\"$refName\">$note</ref>\n";
        }
        elsif (exists $refCorrs{$currMatch}) {
            # Deal with blank notes. We don't want to be inserting <ref></ref>
            # into the article.
            $numErrors++;
            $warnings .= "Found a blank multiply referenced note, ref is \"$currMatch\"\n";
        }
        else {
            $numErrors++;
            $warnings .= "Multiple reference \"$currMatch\" doesn\'t exist in notes. Turning into \{\{citation needed\}\}\n";
        }
    }

    # One more loop through any remaining {{ref}} tags to turn them into
    # {{citation needed}}.
    $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*[^\|]*?\s*(?:\|[^\|\}]*?\s*)*?\}\}/\{\{citation needed\}\}/gi;

    # Remove excess blank lines that we may have just made by deleting the
    # content in between. s///g is true only when it replaced something.
    if ($finalText =~ s/\n{4,}/\n\n/g) {
        $warnings .= "I think I have found too many consecutive newlines, I am going to remove them, make sure I did this right.\n";
    }

    # Final sanity checks. These are plain yes/no matches without /g: a
    # scalar //g match remembers its position, so chaining several on the
    # same string could start a later check part-way through the text and
    # miss a leftover template.
    if ($finalText =~ m/\{\{ref/i) {
        $warnings .= "Failing sanity check, there may still be some {{ref}}s left.\n";
    }
    if ($finalText =~ m/\{\{note/i) {
        $warnings .= "Failing sanity check, there may still be some {{note}}s left.\n";
    }
    if ($finalText =~ m/\{\{mn/i) {
        $warnings .= "Failing sanity check, there may still be some Footnote4 stuff left ({{mn}} or {{mnb}}).\n";
    }

    print 'Finished. ' . "\n";

    # Double quotes so that \n is a newline rather than a literal backslash-n.
    print "\n\n\n";
    print "WRITING OUTPUT FILE \n\n";

    # Relative path: written to the current working directory.
    my $outfile = "convertedFile.txt";
    open(my $out_fh, '>', $outfile) or die "Can't open $outfile : $!";
    print {$out_fh} $finalText;
    # Buffered write errors only surface at close, so check it.
    close($out_fh) or die "Can't close $outfile : $!";

    # The warnings ask the user to double-check specific edits, so show them
    # instead of silently discarding them.
    if (defined $warnings && length $warnings) {
        print "WARNINGS (" . ($numErrors || 0) . " possible errors):\n";
        print $warnings;
    }
}
else {
    # No output file is written in this case.
    print "No {{ref}}-style templates found; nothing to convert.\n";
}

# Writes log output to a file.
# NOTE(review): in this local version the body is an empty stub, so calling
# it does nothing and returns an empty list; the $optLogging / $optLogLoc
# settings are not consulted here.
sub writeToLog {
    return;
}