#
# myweb.pl
#
# Andrew's Comics and TV summary
# v0.0: Andrew Glassner, March 26
# v0.1: AG March 27, 2003
#     Very rough, generally working
# v0.2: AG March 29, 2003
#     Brought together all the fetches into "getItem".
#     Made utility proc for United Media cartoons
#     Made some of the TV show searchs a little more robust
#     Added comments.
#     Added lots of function, cleaned things up
# v0.3: AG April 4, 2003
#     Added a couple of talk shows
#     Removed comments in getItem (this removed some false hits)
# v0.4: AG April 15, 2003
#     Added documentation and comments
# v0.5: EH April 15, 2003
#     Added too many new comics, made getItem more general
#
#
# HOW TO USE THIS SCRIPT:
#
# I have two shortcuts on my desktop. One is a shortcut to this
# file, named myweb.pl. The other is a shortcut to the web page
# it makes (the one pointed to by $webFile below). Obviously you
# have to run the script once to make that file so you can then
# make a shortcut to it. Even though the contents of the page
# get updated every day, the name stays the same, so the
# shortcut stays good.
#
# Every morning, I double-click on the shortcut to the script
# It opens up a command window, runs perl, and builds the web
# page. Then I double-click on the shortcut to the web file,
# it pops up in my browser, and I read and enjoy.
#
# To customize the script, you need to do three things, named
# under "ATTENTION" a few lines below here.
#
# This script requires perl. You can get a free copy of perl
# from http://www.activestate.com/Products/ActivePerl/
# Just download and install it; nothing special is required.
#
# These modules are typically installed as part of perl.
# If yours are out of date, get new ones from www.cpan.org

require LWP::UserAgent;
require HTTP::Request;

#
# ATTENTION
#
# set these three things
#   1. myName is your name!
#   2. webFile is the path for the output web page
#   3. pick the content you want in the subroutine getWebContent below
#
$myName  = "Eric";
$webFile = "C:/my documents/perl/myweb.htm";   # note: use forward slashes

makeWebPage();
exit(0);    # 0 = success (a non-zero status would report a failed run)

#
# The items below get displayed in the order they're named.
# To remove items, either delete them from the list, or turn them
# into a comment by putting a # sign at the start of the line.
#
sub getWebContent {
    getDilbert();
    getFoxTrot();
    getBizarro();
    getForBetterOrForWorse();
    getDoonesbury();
    getTomTheDancingBug();
    getStoryMinute();
    getTroubletown();
    getPearslBeforeSwine();
    getTheKChronicles();
    getDorkTower();
    getNodwick();
    getThisModernWorld();
    getDTWOF();
    getPatOliphant();
    getTomToles();
    getTedRall();
    getAnnTelnaes();
    getDavidHorsey();
    getTonyAuth();
    getSteveSack();
    getClayBennett();
    getKirkAnderson();
    getCharlieRose();
    getJonStewart();
    getJayLeno();
    getDavidLetterman();
    getConanOBrien();
    getCraigKilborn();
    get9ChickweedLane();
    getAgnes();
    getAlleyOop();
    getAndyCapp();
    getArloJanis();
    getBC();
    getBachelorParty();
    getBallardStreet();
    getBen();
    getBetty();
    getBigNate();
    getBornLoser();
    getBuckets();
    getBullsNBears();
    getCharlie();
    getCheapThrills();
    getCommitted();
    getDrabble();
    getDrawingACrowd();
    getFatCats();
    getFerdnand();
    getFlightDeck();
    getFloNFriends();
    getFrankAndErnest();
    getFrazz();
    getGeech();
    getGetFuzzy();
    getGingerMeggs();
    getGoFigure();
    getGraffiti();
    getGrandAvenue();
    getTheGrizzwells();
    getHeathcliff();
    getHerbAndJamaal();
    getHerman();
    getJanesWorld();
    getJumpStart();
    getKitNCarlyle();
    getLibertyMeadows();
    getLilAbner();
    getLuann();
    getLupoAlberto();
    getMarmaduke();
    getMeatloafNight();
    getMeg();
    getModeratelyConfused();
    getMomma();
    getMonty();
    getMotley();
    getNancy();
    getNaturalSelection();
    getOffTheMark();
    getOneBigHappy();
    getTheOtherCoast();
    getOutOfTheGenePool();
    getOverTheHedge();
    getPCAndPixel();
    getPeanuts();
    # getPearslBeforeSwine();   # already fetched near the top of this list;
                                # listing it twice put the strip on the page twice
    getPibgorn();
    getPickles();
    getPotluckParish();
    getRaisingDuncan();
    getRealityCheck();
    getRedAndRover();
    getRipleys();
    getRoseIsRose();
    getRubes();
    getRudyPark();
    getSheldon();
    getShirleyAndSon();
    getSoupToNutz();
    getSpeedBump();
    getStockcarToons();
    getStrangeBrew();
    getTarzan();
    getThatsLife();
    getTopOfTheWorld();
    getWizardOfId();
    getWorkingDaze();
    getRobertAriail();
    getChuckAsay();
    getSteveBenson();
    getRandyBish();
    getChipBok();
    getBillDay();
    getJerryHolbert();
    getEttaHulme();
    getDrewLitton();
    getMikeLuckovich();
    getHenryPayne();
    getRobRogers();
    getBillSchorr();
    getMikeSmith();
    getJeffStahler();
    getEdStein();
    getPaulSzep();
    getGaryVarvel();
}

###########################################
###### Individual content fetches
###########################################
#
# NOTE(review): in this copy of the script most prefix/suffix arguments
# are just "\n\n", several getItem calls pass five arguments where getItem
# takes six, and a few patterns end abruptly. It looks as though HTML
# markup was stripped out of the string literals at some point - confirm
# against an original copy and restore it. The literals below are kept
# exactly as found.
#

#
# United Comics: their consistent form makes life easier
#
sub getForBetterOrForWorse { getUnitedComic("For Better Or For Worse", "forbetterorforworse", "fb"); }
sub getTomTheDancingBug    { getUnitedComic("Tom The Dancing Bug", "tomthedancingbug", "td"); }
sub getDoonesbury          { getUnitedComic("Doonesbury", "doonesbury", "db"); }
sub getFoxTrot             { getUnitedComic("FoxTrot", "foxtrot", "ft"); }
sub getPatOliphant         { getUnitedComic("Pat Oliphant", "patoliphant", "po"); }
sub getTomToles            { getUnitedComic("Tom Toles", "tomtoles", "tt"); }
sub getTedRall             { getUnitedComic("Ted Rall", "tedrall", "tr"); }
sub getAnnTelnaes          { getUnitedComic("Ann Telnaes", "anntelnaes", "tmate"); }
sub getTonyAuth            { getUnitedComic("Tony Auth", "tonyauth", "ta"); }
sub getSteveSack           { getUnitedComic("Steve Sack", "stevesack", "tmssa"); }
sub getDavidHorsey         { getUnitedComic("David Horsey", "davidhorsey", "tmdho"); }

# United Media comics
#sub getDilbert { &getUnitedMedia("Dilbert","comics/dilbert","gif"); }
sub get9ChickweedLane { getUnitedMedia("9 Chickweed Lane", "comics/chickweed", "jpg"); }
sub getAgnes { getUnitedMedia("Agnes", "creators/agnes", "gif"); }
# Was "comics/agnes" (copied from the line above), which fetched the wrong page.
# NOTE(review): "comics/alleyoop" is the presumed directory - confirm on the site.
sub getAlleyOop { getUnitedMedia("Alley Oop", "comics/alleyoop", "gif"); }
sub getAndyCapp { getUnitedMedia("Andy Capp", "creators/andycapp", "gif"); }
sub getArloJanis { getUnitedMedia("Arlo & Janis", "comics/arlonjanis", "gif"); }
sub getBC { getUnitedMedia("B. C.", "creators/bc", "gif"); }
sub getBachelorParty { getUnitedMedia("Bachelor Party", "creators/bachelorparty", "gif"); }
sub getBallardStreet { getUnitedMedia("Ballard Street", "creators/ballardst", "gif"); }
sub getBen { getUnitedMedia("Ben", "comics/ben", "gif"); }
sub getBetty { getUnitedMedia("Betty", "comics/betty", "gif"); }
sub getBigNate { getUnitedMedia("Big Nate", "comics/bignate", "gif"); }
sub getBornLoser { getUnitedMedia("The Born Loser", "comics/bornloser", "gif"); }
sub getBuckets { getUnitedMedia("The Buckets", "comics/buckets", "gif"); }
sub getBullsNBears { getUnitedMedia("Bull\$ N Bear\$", "comics/bullsnbears", "gif"); }
sub getCharlie { getUnitedMedia("Charlie", "creators/charlie", "gif"); }
sub getCheapThrills { getUnitedMedia("Cheap Thrills", "wash/cheapthrills", "gif"); }
sub getCommitted { getUnitedMedia("Committed", "comics/committed", "gif"); }
sub getDrabble { getUnitedMedia("Drabble", "comics/drabble", "gif"); }
sub getDrawingACrowd { getUnitedMedia("Drawing a Crowd", "creators/drawingacrowd", "gif"); }
sub getFatCats { getUnitedMedia("Fat Cats", "comics/fatcats", "gif"); }
# Title and directory were spelled "Fred'nand" / "comics/frednand".
# NOTE(review): directory changed to match the strip's name - confirm on the site.
sub getFerdnand { getUnitedMedia("Ferd'nand", "comics/ferdnand", "gif"); }
sub getFlightDeck { getUnitedMedia("Flight Deck", "creators/flightdeck", "gif"); }
sub getFloNFriends { getUnitedMedia("Flo & Friends", "creators/floandfriends", "gif"); }
sub getFrankAndErnest { getUnitedMedia("Frank & Ernest", "comics/franknernest", "jpg"); }
sub getFrazz { getUnitedMedia("Frazz", "comics/frazz", "gif"); }
sub getGeech { getUnitedMedia("Geech", "comics/geech", "gif"); }
sub getGetFuzzy { getUnitedMedia("Get Fuzzy", "comics/getfuzzy", "gif"); }
sub getGingerMeggs { getUnitedMedia("Ginger Meggs", "comics/gingermeggs", "gif"); }
sub getGoFigure { getUnitedMedia("Go Figure", "comics/gofigure", "gif"); }
sub getGraffiti { getUnitedMedia("Graffiti", "comics/graffiti", "gif"); }
sub getGrandAvenue { getUnitedMedia("Grand Avenue", "comics/grandave", "gif"); }
sub getTheGrizzwells { getUnitedMedia("The Grizzwells", "comics/grizzwells", "gif"); }
sub getHeathcliff { getUnitedMedia("Heathcliff", "creators/heathcliff", "gif"); }
sub getHerbAndJamaal { getUnitedMedia("Herb and Jamaal", "creators/herbnjamaal", "gif"); }
sub getHerman { getUnitedMedia("Herman", "comics/herman", "jpg"); }
sub getJanesWorld { getUnitedMedia("Jane's World", "comics/janesworld", "gif"); }
sub getJumpStart { getUnitedMedia("Jump Start", "comics/jumpstart", "gif"); }
sub getKitNCarlyle { getUnitedMedia("Kit 'N' Carlyle", "comics/kitncarlyle", "gif"); }
sub getLibertyMeadows { getUnitedMedia("Liberty Meadows", "creators/liberty", "gif"); }
sub getLilAbner { getUnitedMedia("Li'l Abner", "comics/lilabner", "gif"); }
sub getLuann { getUnitedMedia("Luann", "comics/luann", "gif"); }
sub getLupoAlberto { getUnitedMedia("Lupo Alberto", "comics/lupo", "gif"); }
sub getMarmaduke { getUnitedMedia("Marmaduke", "comics/marmaduke", "gif"); }
sub getMeatloafNight { getUnitedMedia("Meatloaf Night", "comics/meatloaf", "gif"); }
sub getMeg { getUnitedMedia("Meg!", "comics/meg", "gif"); }
sub getModeratelyConfused { getUnitedMedia("Moderately Confused", "comics/moderatelyconfused", "gif"); }
sub getMomma { getUnitedMedia("Momma", "creators/momma", "gif"); }
sub getMonty { getUnitedMedia("Monty", "comics/monty", "gif"); }
sub getMotley { getUnitedMedia("Motley", "comics/motley", "gif"); }
sub getNancy { getUnitedMedia("Nancy", "comics/nancy", "gif"); }
sub getNaturalSelection { getUnitedMedia("Natural Selection", "creators/naturalselection", "gif"); }
sub getOffTheMark { getUnitedMedia("Off The Mark", "comics/offthemark", "gif"); }
sub getOneBigHappy { getUnitedMedia("One Big Happy", "creators/onebighappy", "gif"); }
sub getTheOtherCoast { getUnitedMedia("The Other Coast", "creators/othercoast", "gif"); }
sub getOutOfTheGenePool { getUnitedMedia("Out of the Gene Pool", "wash/genepool", "gif"); }
sub getOverTheHedge { getUnitedMedia("Over the Hedge", "comics/hedge", "gif"); }
sub getPCAndPixel { getUnitedMedia("PC and Pixel", "wash/pcnpixel", "gif"); }
sub getPeanuts { getUnitedMedia("Peanuts", "comics/peanuts", "gif"); }
sub getPearslBeforeSwine { getUnitedMedia("Pearls Before Swine", "comics/pearls", "gif"); }
sub getPibgorn { getUnitedMedia("Pibgorn", "comics/pibgorn", "jpg"); }
sub getPickles { getUnitedMedia("Pickles", "wash/pickles", "gif"); }
sub getPotluckParish { getUnitedMedia("Potluck Parish", "comics/potluck", "gif"); }
sub getRaisingDuncan { getUnitedMedia("Raising Duncan", "comics/raisingduncan", "gif"); }
sub getRealityCheck { getUnitedMedia("Reality Check", "comics/reality", "gif"); }
sub getRedAndRover { getUnitedMedia("Red & Rover", "wash/redandrover", "gif"); }
sub getRipleys { getUnitedMedia("Ripley's Believe It or Not!", "comics/ripleys", "gif"); }
sub getRoseIsRose { getUnitedMedia("Rose Is Rose", "comics/roseisrose", "gif"); }
sub getRubes { getUnitedMedia("Rubes", "creators/rubes", "gif"); }
sub getRudyPark { getUnitedMedia("Rudy Park", "comics/rudypark", "gif"); }
sub getSheldon { getUnitedMedia("Sheldon", "comics/sheldon", "gif"); }
sub getShirleyAndSon { getUnitedMedia("Shirley And Son", "comics/shirleynson", "gif"); }
sub getSoupToNutz { getUnitedMedia("Soup To Nutz", "comics/soup2nutz", "gif"); }
sub getSpeedBump { getUnitedMedia("Speed Bump", "creators/speedbump", "gif"); }
sub getStockcarToons { getUnitedMedia("StockcarToons", "comics/stockcartoons", "gif"); }
sub getStrangeBrew { getUnitedMedia("Strange Brew", "creators/strangebrew", "gif"); }
sub getTarzan { getUnitedMedia("Tarzan", "comics/tarzan", "gif"); }
sub getThatsLife { getUnitedMedia("That's Life", "wash/thatslife", "gif"); }
sub getTopOfTheWorld { getUnitedMedia("Top of the World!", "comics/topofworld", "gif"); }
sub getWizardOfId { getUnitedMedia("Wizard of Id", "creators/wizardofid", "gif"); }
sub getWorkingDaze { getUnitedMedia("Working Daze", "comics/workingdaze", "gif"); }

# editorial cartoons
# (the directory here used to carry a trailing space, which ended up in
# both the request URL and the image pattern)
sub getRobertAriail { getUnitedMedia("Robert Ariail", "editoons/ariail", "gif"); }
sub getChuckAsay { getUnitedMedia("Chuck Asay", "editoons/asay", "gif"); }
sub getSteveBenson { getUnitedMedia("Steve Benson", "editoons/benson", "gif"); }
sub getRandyBish { getUnitedMedia("Randy Bish", "editoons/bish", "jpg"); }
sub getChipBok { getUnitedMedia("Chip Bok", "editoons/bok", "gif"); }
sub getBillDay { getUnitedMedia("Bill Day", "editoons/day", "gif"); }
sub getJerryHolbert { getUnitedMedia("Jerry Holbert", "editoons/holbert", "gif"); }
sub getEttaHulme { getUnitedMedia("Etta Hulme", "editoons/hulme", "gif"); }
sub getDrewLitton { getUnitedMedia("Drew Litton", "editoons/litton", "gif"); }
sub getMikeLuckovich { getUnitedMedia("Mike Luckovich", "editoons/luckovich", "gif"); }
sub getHenryPayne { getUnitedMedia("Henry Payne", "editoons/payne", "gif"); }
sub getRobRogers { getUnitedMedia("Rob Rogers", "editoons/rogers", "gif"); }
sub getBillSchorr { getUnitedMedia("Bill Schorr", "editoons/schorr", "gif"); }
sub getMikeSmith { getUnitedMedia("Mike Smith", "editoons/smith", "gif"); }
sub getJeffStahler { getUnitedMedia("Jeff Stahler", "editoons/stahler", "gif"); }
sub getEdStein { getUnitedMedia("Ed Stein", "editoons/stein", "gif"); }
sub getPaulSzep { getUnitedMedia("Paul Szep", "editoons/szep", "gif"); }
sub getGaryVarvel { getUnitedMedia("Gary Varvel", "editoons/varvel", "gif"); }

#
# Disparate comics
#

# Dilbert has a slightly different archive, even though it's United Media, like above
sub getDilbert {
    getItem("Dilbert",
        "http://www.unitedmedia.com/comics/dilbert/",
        "src=.(\/comics\/dilbert\/archive\/images.*?gif)",
        "\n\n", 0);
}

sub getClayBennett {
    getItem("Clay Bennett",
        "http://www.christiansciencemonitor.com/commentary/index.html",
        "today.s cartoon.*?img.src=.(\/.*?csmimg.cartoon.jpg)",
        "\n\n", 0);
}

sub getDorkTower {
    getItem("Dork Tower",
        "http://www.gamespy.com/comics/dorktower/",
        "src=.(images\/comics\/DorkTower.*?jpg)",
        "\n\n", 0);
}

sub getStoryMinute {
    getItem("Story Minute",
        "http://www.waylay.com/Store/OrigPages/Originals.html",
        "src=\"(\.\.\/\.\.\/Images\/Strips\/.*?gif)",
        "\n\n", 0);
}

sub getDTWOF {
    getItem("Dykes To Watch Out For",
        "http://www.planetout.com/entertainment/comics/splash.html",
        "c:\/entertainment\/comics\/dtwof\/archive\/(.*?)html",
        "\n\n", 0);
}

# Doesn't work: the php grab comes from off-site, so they send a
# "no image available" GIF instead. Clever.
# So this just writes a banner and a short message instead of fetching.
sub getBizarro {
    # getItem("Bizarro",
    #     "http://www.kingfeatures.com/features/comics/bizarro/aboutMaina.php",
    #     "(date=.*?)'",
    #     "\n\n", 0);
    makeBanner("Bizarro");   # same three lines the hand-written printfs produced
    # NOTE(review): the message mentions a link, but no link markup is
    # present in this literal any more - restore it from an original copy.
    print WEBFILE "\n\nSorry, you've got to click this link.\n\n\n";
}

# Two-step fetch: find the newest item id on the column list page, then
# pull the cartoon image URL out of that item's article page.
sub getThisModernWorld {
    my $urlstring = getStringFromPage(
        "http://www.workingforchange.com/column_lst.cfm?AuthrId=43",
        "itemid=(\\d+)\"");
    # better, but can't quite quote it right: "article.cfm?itemid=(.*?)\&");

    # getStringFromPage has already reported the problem; don't go on to
    # request an article URL with no item id on the end.
    return if $urlstring eq "";

    getItem("This Modern World",
        "http://www.workingforchange.com/article.cfm?itemid=" . $urlstring,
        "(http:\/\/workingforchange.speedera.net\/www.workingforchange.com\/webgraphics\/wfc\/TMW.*?gif)\"",
        "\n\n", 0);
}

sub getTroubletown {
    getItem("Troubletown",
        "http://www.troubletown.com/cartoons/index.html",
        "SRC=\"(cartoons/ttown.*?gif)",
        "\n\n", 0);
}

# Two-step fetch: find the link to the current K Chronicles page on the
# index, then pull the image path out of that page.
sub getTheKChronicles {
    my $grabbit = getStringFromPage(
        "http://www.buzzle.com/chapters/escape-hatch_cartoons-and-comics.asp",
        "a href=\"..(/editorials/.*?asp)\">the K Chronicles");

    # Nothing found on the index page (already reported): skip the strip.
    return if $grabbit eq "";

    getItem("The K Chronicles",
        "http://www.buzzle.com" . $grabbit,
        "\.\.(/pix/articleImages/.*?jpg)",
        "\n\n", 0);
}

sub getNodwick {
    getItem("Nodwick",
        "http://www.gamespy.com/comics/nodwick/gamespyarchive/newnodwick.html",
        "src=.(Nodwick1.*?jpg)",
        "\n\n", 0);
}

# this is an easy special case - just point to the URL
# NOTE(review): no URL is left in the strings below; only newlines remain.
sub getKirkAnderson {
    makeBanner("Kirk Anderson");
    print WEBFILE "\n\n\n";
    print WEBFILE "\n";
    print WEBFILE "\n\n\n";
}

# The talk shows are all slightly different and idiosyncratic
sub getCharlieRose {
    getItem("Charlie Rose",
        "http://www.charlierose.com/index.shtm",
        "producers.*()",
        "", "", 1);
}

sub getJonStewart {
    getItem("Jon Stewart",
        "http://www.comedycentral.com/tv_shows/thedailyshowwithjonstewart/",
        "TONIGHT:(.*?)<",
        "\n\n", "\n\n", 1);
}

sub getJayLeno {
    getItem("Jay Leno",
        "http://www.nbc.com/nbc/The_Tonight_Show_with_Jay_Leno/",
        "whitebold..(.*?)..span",
        "\n\n", "\n\n", 1);
}

sub getDavidLetterman {
    # NOTE(review): the pattern ends in a literal newline, which can never
    # match because getStringFromPage turns newlines into spaces first.
    getItem("David Letterman",
        "http://www.cbs.com/latenight/lateshow/",
        "Tonight.s Guests.*?:<\/b>(.*?)\n",
        "\n\n", "\n\n", 1);
}

sub getConanOBrien {
    getItem("Conan O\'Brien",
        "http://www.nbc.com/Late_Night_with_Conan_O'Brien/index.html",
        "Tonight Conan welcomes (.*?)",
        "\n\n", "\n\n", 1);
}

sub getCraigKilborn {
    getItem("Craig Kilborn",
        "http://www.cbs.com/latenight/latelate/",
        "(.*?)<",
        "\n\n", "\n\n", 1);
}

###########################################
### Subroutines
###########################################

#
# The basic routine that drives the others.
# Open up the web file, fill it with content, and close it up
#
# Reads the package variables $myName and $webFile. Dies if the output
# file cannot be opened or closed.
#
sub makeWebPage {
    # localtime gives $wday as 0 (Sunday) .. 6 (Saturday), a 0-based $mon,
    # and $year as years since 1900.
    my ($mday, $mon, $year, $wday) = (localtime())[3, 4, 5, 6];
    my $dayname = ("Sunday", "Monday", "Tuesday", "Wednesday",
                   "Thursday", "Friday", "Saturday")[$wday];
    my $monthName = ("January", "February", "March", "April", "May", "June",
                     "July", "August", "September", "October", "November",
                     "December")[$mon];
    my $fullYear = $year + 1900;

    # WEBFILE stays a bareword handle because every fetch routine prints to it.
    open(WEBFILE, '>', $webFile)
        or die "Error opening web htm file $webFile: $!\n";

    print WEBFILE "\n\n\n";
    print WEBFILE "$myName\'s Custom Webpage\n\n";
    print WEBFILE "\n";
    print WEBFILE "\n\n$myName\'s Custom Webpage\n\n\n";
    print WEBFILE "\n\n$dayname, $monthName $mday, $fullYear\n\n\n";

    makeUserAgent();
    getWebContent();
    makeBanner(" ");

    # This used to be written "<\html>", which Perl prints as "<html>".
    print WEBFILE "\n</html>\n";

    # Buffered write errors only show up at close time.
    close(WEBFILE)
        or die "Error closing web htm file $webFile: $!\n";
}

#
# Create the user agent so we can fetch web pages
# (stored in the package variable $ua, which getStringFromPage uses)
#
sub makeUserAgent {
    $ua = LWP::UserAgent->new;
}

#
# Make a horizontal banner to identify the content.
# Parameters: bannerTitle....the string to go in the banner
#
sub makeBanner {
    my ($bannerTitle) = @_;
    print WEBFILE "\n";
    print WEBFILE "\n\n";
    print WEBFILE "$bannerTitle\n\n";
}

#
# A little convenience proc for those comics provided by United Comics
# Parameters:
#    name.......The name of the cartoonist
#    dir........The UC directory for that person's webpage
#    inits......The cartoonist's initials used by UC to find the image
#
sub getUnitedComic {
    my ($name, $dir, $inits) = @_;
    my $reqURL  = "http://www.ucomics.com/$dir/index.phtml";
    my $pattern = "img src=.(http:..images.ucomics.com.comics.$inits.*?gif)";
    getItem($name, $reqURL, $pattern, "\n\n", 0);
}

#
# united media has lots, too
# Parameters:
#    name.......The title shown in the banner
#    dir........The directory under www.unitedmedia.com (e.g. "comics/luann")
#    ext........The image file extension to look for ("gif" or "jpg")
#
sub getUnitedMedia {
    my ($name, $dir, $ext) = @_;
    getItem($name,
        "http://www.unitedmedia.com/".$dir."/",
        "src=.(/".$dir."\/archive\/images.*?".$ext.")",
        "\n\n", 0);
}

###########################################
#
# Subroutine getItem is the heart of the program. It goes off
# and fetches a web page, extracts the pattern match, removes
# links, optionally removes embedded images, and then writes
# the extracted pattern out to the custom web page with the
# given prefix and suffix.
#
# Try to avoid using explicit double quotes in the pattern.
# Use . in their place instead - it's less confusing.
#
# parameters:
#   title..........The string that appears in the banner on the web page
#   reqURL.........The URL with the content we want to extract
#   pattern........Specifies which content to extract. This is a perl
#                  regular expression. It will be interpreted in a
#                  case-insensitive way. The actual content to be
#                  placed on the web page should be enclosed in round
#                  parentheses, e.g. "stuff(get this)stuff". Note as
#                  above that you should avoid using double-quotes in
#                  this string, because they're confusing while trying
#                  to debug the pattern.
#   prefix.........This gets placed in front of the material found by the
#                  pattern when it's written to the web page.
#   suffix.........This comes after the matched material.
#   removeImages...Set this to 1 for text-only extractions, like talk
#                  shows and TV listings. Set to 0 when you want to
#                  retain pictures, like comics.
#
# Nothing is written (not even the banner) when the pattern finds nothing.
#
###########################################
sub getItem {
    # unpack the procedure arguments
    my ($title, $reqURL, $pattern, $prefix, $suffix, $stripImages) = @_;

    # getStringFromPage reads the package variable $removeImages. Setting
    # it with "local" confines the value to this one fetch, so it cannot
    # leak into later direct calls of getStringFromPage.
    local $removeImages = $stripImages;

    # get the contents of the web page named by the URL
    my $insstring = getStringFromPage($reqURL, $pattern);
    return if $insstring eq "";

    makeBanner($title);
    print WEBFILE "$prefix$insstring$suffix\n";
}

#
# A generally handy subroutine: pulls in a web page, searches for a string,
# returns the matched string if found, else returns nothing and warns the user.
#
# Parameters:
#   reqURL.........The URL of the page to fetch
#   pattern........A perl regular expression, matched case-insensitively
#                  against the whole response with line breaks turned
#                  into spaces. The text captured by its first pair of
#                  round parentheses is what gets returned.
#
# Also reads two package variables:
#   $ua............The LWP::UserAgent created by makeUserAgent
#   $removeImages..Set by getItem; when true, <img> tags are removed
#                  from the captured text
#
# Returns the captured text with <a> tags removed and runs of blanks
# squeezed to one blank, or "" if the fetch failed or nothing matched.
# In both failure cases a line starting with "ERROR:" is printed.
#
sub getStringFromPage {
    # unpack the procedure arguments
    my ($reqURL, $pattern) = @_;

    # get the contents of the web page named by the URL
    my $request = HTTP::Request->new(GET => $reqURL);
    my $answer  = $ua->request($request);
    if (!$answer->is_success) {
        # This case used to return "" without telling the user anything.
        print "ERROR: could not fetch $reqURL: ", $answer->status_line, "\n";
        return "";
    }

    # Flatten the response onto one line so a pattern can span line breaks.
    my $nobreak = $answer->as_string;
    $nobreak =~ s/\n/ /g;   # remove newlines
    $nobreak =~ s/\r/ /g;   # remove carriage returns

    # Only trust $1 when the match succeeded; after a failed match it can
    # still hold the capture from some earlier, unrelated match.
    my $srcline = "";
    if ($nobreak =~ /$pattern/i && defined $1) {
        $srcline = $1;      # srcline holds what was in the parens
    }

    # remove any hrefs (the refs might be local to the site).
    # These two substitutions had been reduced to s///ig. An empty pattern
    # makes perl re-use the last successful regex, so instead of stripping
    # tags they deleted whatever $pattern matched inside the capture.
    $srcline =~ s{</?a\b[^>]*>}{}ig;
    if ($removeImages) {
        $srcline =~ s{<img\b[^>]*>}{}ig;
    }
    $srcline =~ s/ +/ /g;   # reduce big chunks of blanks to singles

    if ($srcline eq "") {
        # print, not printf: the pattern is data and may contain '%'.
        print "ERROR: no pattern \"$pattern\" found in $reqURL\n";
    }
    return $srcline;
}