{
 "metadata": {
  "kernelspec": {
   "language": "python",
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.7.12",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  }
 },
 "nbformat_minor": 4,
 "nbformat": 4,
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Download data from Wikipedia\n",
    "\n",
    "Downloads the English Wikipedia article of every artist on the [List of mainstream rock performers](https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers), saves each article as `<pageid>.json` in the `rock_wiki` directory, and packs that directory into `rock_wiki.tar.gz`."
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Setup"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "source": [
    "# install wikipedia API python wrapper (pinned so that re-runs use the same release)\n",
    "%pip install -q wikipedia==1.4.0"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "import json\n",
    "import traceback\n",
    "from pathlib import Path\n",
    "\n",
    "import wikipedia"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# directory that receives one JSON file per downloaded page\n",
    "OUTPUT_DIR = Path('/kaggle/working/rock_wiki')"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Titles to download"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "source": [
    "# titles to download, from https://en.wikipedia.org/wiki/List_of_mainstream_rock_performers\n",
    "# (URL form: underscores instead of spaces, one title per line)\n",
    "\n",
    "pages_titles = \"\"\"10cc\n",
    "10_Years_(band)\n",
    "3_Doors_Down\n",
    "311_(band)\n",
    "38_Special_(band)\n",
    "Accept_(band)\n",
    "AC/DC\n",
    "Bryan_Adams\n",
    "Aerosmith\n",
    "AFI_(band)\n",
    "Air_Supply\n",
    "The_Alan_Parsons_Project\n",
    "Alice_in_Chains\n",
    "The_All-American_Rejects\n",
    "The_Allman_Brothers_Band\n",
    "Alter_Bridge\n",
    "Ambrosia_(band)\n",
    "America_(band)\n",
    "The_Animals\n",
    "Adam_Ant\n",
    "Anthrax_(American_band)\n",
    "April_Wine\n",
    "Arcade_Fire\n",
    "Arctic_Monkeys\n",
    "Asia_(band)\n",
    "Audioslave\n",
    "Avenged_Sevenfold\n",
    "Awolnation\n",
    "The_B-52's\n",
    "Bachman–Turner_Overdrive\n",
    "Bad_Company\n",
    "Badfinger\n",
    "The_Band\n",
    "The_Bangles\n",
    "Barenaked_Ladies\n",
    "Bay_City_Rollers\n",
    "The_Beach_Boys\n",
    "The_Beatles\n",
    "Beck\n",
    "Ben_Folds_Five\n",
    "Pat_Benatar\n",
    "Chuck_Berry\n",
    "The_Big_Bopper\n",
    "Billy_Talent\n",
    "The_Black_Crowes\n",
    "The_Black_Keys\n",
    "Black_Sabbath\n",
    "Black_Stone_Cherry\n",
    "Black_Veil_Brides\n",
    "Blink-182\n",
    "Bloodhound_Gang\n",
    "Blue_October\n",
    "Blue_Öyster_Cult\n",
    "Blues_Traveler\n",
    "James_Blunt\n",
    "Blur_(band)\n",
    "Bon_Jovi\n",
    "Boston_(band)\n",
    "David_Bowie\n",
    "Bowling_for_Soup\n",
    "Boys_Like_Girls\n",
    "Bread_(band)\n",
    "Breaking_Benjamin\n",
    "Bring_Me_the_Horizon\n",
    "Jackson_Browne\n",
    "Buckcherry\n",
    "Jeff_Buckley\n",
    "Bullet_for_My_Valentine\n",
    "Bush_(British_band)\n",
    "The_Byrds\n",
    "Cage_the_Elephant\n",
    "Cake_(band)\n",
    "Canned_Heat\n",
    "The_Cab\n",
    "The_Cardigans\n",
    "The_Cars\n",
    "Catfish_and_the_Bottlemen\n",
    "Harry_Chapin\n",
    "Tracy_Chapman\n",
    "Cheap_Trick\n",
    "Chevelle_(band)\n",
    "Chicago_(band)\n",
    "Chubby_Checker\n",
    "Cinderella_(band)\n",
    "Dallas_Green_(musician)\n",
    "Eric_Clapton\n",
    "The_Clash\n",
    "Eddie_Cochran\n",
    "Joe_Cocker\n",
    "Coheed_and_Cambria\n",
    "Cold_Chisel\n",
    "Coldplay\n",
    "Collective_Soul\n",
    "Phil_Collins\n",
    "Alice_Cooper\n",
    "Chris_Cornell\n",
    "Elvis_Costello\n",
    "Counting_Crows\n",
    "The_Cranberries\n",
    "Crash_Test_Dummies\n",
    "Cream_(band)\n",
    "Creed_(band)\n",
    "Creedence_Clearwater_Revival\n",
    "Jim_Croce\n",
    "Crosby,_Stills,_Nash_&_Young\n",
    "Christopher_Cross\n",
    "Sheryl_Crow\n",
    "Crowded_House\n",
    "The_Cult\n",
    "The_Cure\n",
    "Damn_Yankees_(band)\n",
    "Dashboard_Confessional\n",
    "Daughtry_(band)\n",
    "The_Dave_Clark_Five\n",
    "Dave_Matthews_Band\n",
    "Days_of_the_New\n",
    "Death_Cab_for_Cutie\n",
    "Deep_Purple\n",
    "Def_Leppard\n",
    "Deftones\n",
    "Depeche_Mode\n",
    "Bo_Diddley\n",
    "Dio_(band)\n",
    "Dire_Straits\n",
    "Disturbed_(band)\n",
    "Fats_Domino\n",
    "Donovan\n",
    "The_Doobie_Brothers\n",
    "The_Doors\n",
    "Dr._Hook_&_the_Medicine_Show\n",
    "Dropkick_Murphys\n",
    "Drowning_Pool\n",
    "Duran_Duran\n",
    "Ian_Dury\n",
    "Bob_Dylan\n",
    "Eagles_(band)\n",
    "Echo_&_the_Bunnymen\n",
    "Duane_Eddy\n",
    "Edgar_Winter\n",
    "Electric_Light_Orchestra\n",
    "Emerson,_Lake_&_Palmer\n",
    "England_Dan_&_John_Ford_Coley\n",
    "Melissa_Etheridge\n",
    "Europe_(band)\n",
    "Evanescence\n",
    "Everclear_(band)\n",
    "Everlast\n",
    "The_Everly_Brothers\n",
    "Extreme_(band)\n",
    "Faces_(band)\n",
    "Faith_No_More\n",
    "Fall_Out_Boy\n",
    "Bryan_Ferry\n",
    "Filter_(band)\n",
    "Finger_Eleven\n",
    "FireHouse\n",
    "Five_Finger_Death_Punch\n",
    "Five_for_Fighting\n",
    "The_Fixx\n",
    "The_Flaming_Lips\n",
    "Fleetwood_Mac\n",
    "Flogging_Molly\n",
    "Florence_and_the_Machine\n",
    "Flyleaf_(band)\n",
    "Foals_(band)\n",
    "Dan_Fogelberg\n",
    "John_Fogerty\n",
    "Foo_Fighters\n",
    "Foreigner_(band)\n",
    "Foster_the_People\n",
    "The_Four_Seasons_(band)\n",
    "Peter_Frampton\n",
    "Franz_Ferdinand_(band)\n",
    "The_Fray\n",
    "Glenn_Frey\n",
    "Fuel_(band)\n",
    "Fun_(band)\n",
    "Peter_Gabriel\n",
    "Garbage_(band)\n",
    "Genesis_(band)\n",
    "Ghost_(Swedish_band)\n",
    "Gin_Blossoms\n",
    "Gary_Glitter\n",
    "The_Go-Go's\n",
    "Godsmack\n",
    "Golden_Earring\n",
    "Goo_Goo_Dolls\n",
    "Good_Charlotte\n",
    "Grand_Funk_Railroad\n",
    "Grateful_Dead\n",
    "Great_White\n",
    "Green_Day\n",
    "Greta_Van_Fleet\n",
    "The_Guess_Who\n",
    "Guns_N'_Roses\n",
    "Halestorm\n",
    "Bill_Haley_&_His_Comets\n",
    "Hall_&_Oates\n",
    "George_Harrison\n",
    "Heart_(band)\n",
    "Jimi_Hendrix\n",
    "Don_Henley\n",
    "Herman's_Hermits\n",
    "Highly_Suspect\n",
    "Hinder\n",
    "The_Hives\n",
    "Hole_(band)\n",
    "The_Hollies\n",
    "Buddy_Holly\n",
    "Hoobastank\n",
    "Hootie_&_the_Blowfish\n",
    "Icehouse_(band)\n",
    "Billy_Idol\n",
    "Imagine_Dragons\n",
    "Incubus_(band)\n",
    "Interpol_(band)\n",
    "INXS\n",
    "Iron_Maiden\n",
    "The_J._Geils_Band\n",
    "The_Jam\n",
    "Tommy_James_and_the_Shondells\n",
    "Jane's_Addiction\n",
    "Jefferson_Airplane\n",
    "Jefferson_Starship\n",
    "The_Jesus_and_Mary_Chain\n",
    "Jet_(Australian_band)\n",
    "Jethro_Tull_(band)\n",
    "Joan_Jett\n",
    "Jimmy_Eat_World\n",
    "Billy_Joel\n",
    "Elton_John\n",
    "Janis_Joplin\n",
    "Journey_(band)\n",
    "Joy_Division\n",
    "Judas_Priest\n",
    "Kaiser_Chiefs\n",
    "Kaleo_(band)\n",
    "Kansas_(band)\n",
    "Keane_(band)\n",
    "Kid_Rock\n",
    "The_Killers\n",
    "Killswitch_Engage\n",
    "Kings_of_Leon\n",
    "The_Kinks\n",
    "Kiss_(band)\n",
    "Korn\n",
    "Lenny_Kravitz\n",
    "Lacuna_Coil\n",
    "Lamb_of_God_(band)\n",
    "Avril_Lavigne\n",
    "Led_Zeppelin\n",
    "John_Lennon\n",
    "Huey_Lewis_and_the_News\n",
    "Jerry_Lee_Lewis\n",
    "Lifehouse_(band)\n",
    "Limp_Bizkit\n",
    "Linkin_Park\n",
    "Little_Richard\n",
    "Little_River_Band\n",
    "Live_(band)\n",
    "Living_Colour\n",
    "Kenny_Loggins\n",
    "Loverboy\n",
    "The_Lovin'_Spoonful\n",
    "The_Lumineers\n",
    "Lynyrd_Skynyrd\n",
    "The_Mamas_&_the_Papas\n",
    "Marilyn_Manson\n",
    "The_Marshall_Tucker_Band\n",
    "Matchbox_Twenty\n",
    "John_Mayer\n",
    "Paul_McCartney\n",
    "Meat_Loaf\n",
    "Megadeth\n",
    "John_Mellencamp\n",
    "Men_at_Work\n",
    "Metallica\n",
    "Midnight_Oil\n",
    "Mike_and_the_Mechanics\n",
    "Modest_Mouse\n",
    "Eddie_Money\n",
    "The_Monkees\n",
    "The_Moody_Blues\n",
    "Alanis_Morissette\n",
    "Van_Morrison\n",
    "Morrissey\n",
    "Mötley_Crüe\n",
    "Motörhead\n",
    "Mudvayne\n",
    "Mumford_&_Sons\n",
    "Muse_(band)\n",
    "My_Chemical_Romance\n",
    "Nickelback\n",
    "Stevie_Nicks\n",
    "Harry_Nilsson\n",
    "Nine_Inch_Nails\n",
    "Nirvana_(band)\n",
    "No_Doubt\n",
    "Ted_Nugent\n",
    "Oasis_(band)\n",
    "The_Offspring\n",
    "Roy_Orbison\n",
    "Ozzy_Osbourne\n",
    "Our_Lady_Peace\n",
    "The_Outfield\n",
    "P.O.D.\n",
    "Panic!_at_the_Disco\n",
    "Pantera\n",
    "Papa_Roach\n",
    "Paramore\n",
    "Pearl_Jam\n",
    "A_Perfect_Circle\n",
    "Tom_Petty_and_the_Heartbreakers\n",
    "Pink_Floyd\n",
    "Pixies_(band)\n",
    "Robert_Plant\n",
    "Poison_(American_band)\n",
    "The_Police\n",
    "Iggy_Pop\n",
    "Pop_Evil\n",
    "The_Presidents_of_the_United_States_of_America_(band)\n",
    "The_Pretenders\n",
    "Elvis_Presley\n",
    "The_Pretty_Reckless\n",
    "Primus_(band)\n",
    "Puddle_of_Mudd\n",
    "Queen_(band)\n",
    "Queens_of_the_Stone_Age\n",
    "Queensrÿche\n",
    "Quiet_Riot\n",
    "R.E.M.\n",
    "Radiohead\n",
    "Rage_Against_the_Machine\n",
    "Rainbow_(rock_band)\n",
    "Rammstein\n",
    "Ramones\n",
    "Red_Hot_Chili_Peppers\n",
    "Lou_Reed\n",
    "REO_Speedwagon\n",
    "Rise_Against\n",
    "The_Rolling_Stones\n",
    "Linda_Ronstadt\n",
    "Roxy_Music\n",
    "Royal_Blood_(band)\n",
    "Rush_(band)\n",
    "Saliva_(band)\n",
    "Sam_Fender\n",
    "Santana_(band)\n",
    "Joe_Satriani\n",
    "Saving_Abel\n",
    "Scorpions_(band)\n",
    "The_Script\n",
    "Seether\n",
    "Bob_Seger\n",
    "Sepultura\n",
    "Sex_Pistols\n",
    "Shakin'_Stevens\n",
    "Shinedown\n",
    "Silverchair\n",
    "Simon_&_Garfunkel\n",
    "Simple_Minds\n",
    "Simple_Plan\n",
    "Skid_Row_(American_band)\n",
    "Skillet_(band)\n",
    "Slade\n",
    "Slayer\n",
    "Slipknot_(band)\n",
    "Small_Faces\n",
    "Smash_Mouth\n",
    "The_Smashing_Pumpkins\n",
    "The_Smiths\n",
    "Smokie_(band)\n",
    "Snow_Patrol\n",
    "Social_Distortion\n",
    "Soundgarden\n",
    "Bruce_Springsteen\n",
    "Billy_Squier\n",
    "Staind\n",
    "Ringo_Starr\n",
    "Starset\n",
    "Starship_(band)\n",
    "Status_Quo_(band)\n",
    "Steely_Dan\n",
    "Steppenwolf_(band)\n",
    "Steve_Miller_Band\n",
    "Rod_Stewart\n",
    "Sting_(musician)\n",
    "The_Stone_Roses\n",
    "Stone_Sour\n",
    "Stone_Temple_Pilots\n",
    "The_Strokes\n",
    "Styx_(band)\n",
    "Sublime_(band)\n",
    "Sum_41\n",
    "Supertramp\n",
    "Survivor_(band)\n",
    "The_Sweet\n",
    "System_of_a_Down\n",
    "T._Rex_(band)\n",
    "Talking_Heads\n",
    "James_Taylor\n",
    "Tenacious_D\n",
    "Tesla_(band)\n",
    "Theory_of_a_Deadman\n",
    "Thin_Lizzy\n",
    "Third_Eye_Blind\n",
    "Thirty_Seconds_to_Mars\n",
    "George_Thorogood\n",
    "Thousand_Foot_Krutch\n",
    "Three_Days_Grace\n",
    "Three_Dog_Night\n",
    "Tool_(band)\n",
    "Toto_(band)\n",
    "Traffic_(band)\n",
    "The_Tragically_Hip\n",
    "Train_(band)\n",
    "Traveling_Wilburys\n",
    "Travis_(band)\n",
    "Trivium_(band)\n",
    "Twenty_One_Pilots\n",
    "Twisted_Sister\n",
    "U2\n",
    "Uriah_Heep_(band)\n",
    "The_Used\n",
    "Steve_Vai\n",
    "Ritchie_Valens\n",
    "Vampire_Weekend\n",
    "Van_Halen\n",
    "Stevie_Ray_Vaughan\n",
    "Velvet_Revolver\n",
    "The_Velvet_Underground\n",
    "The_Verve\n",
    "Volbeat\n",
    "Joe_Walsh\n",
    "Warrant_(American_band)\n",
    "Weezer\n",
    "Jack_White\n",
    "The_White_Stripes\n",
    "White_Zombie_(band)\n",
    "Whitesnake\n",
    "The_Who\n",
    "Paul_McCartney_and_Wings\n",
    "Steve_Winwood\n",
    "The_Yardbirds\n",
    "Yes_(band)\n",
    "Neil_Young\n",
    "Frank_Zappa\n",
    "Rob_Zombie\n",
    "The_Zombies\n",
    "ZZ_Top\"\"\".splitlines()"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Download\n",
    "\n",
    "Each page is fetched by its exact title and written to `OUTPUT_DIR` as `<pageid>.json` with the keys `content` and `meta` (`name`, `url`). The loop is best effort: a title that fails is logged and recorded in `failed_titles`, and the remaining titles are still processed."
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "source": [
    "def download_page(raw_title, output_dir=OUTPUT_DIR):\n",
    "    \"\"\"Download one Wikipedia page and store it as ``<pageid>.json`` in ``output_dir``.\n",
    "\n",
    "    Args:\n",
    "        raw_title: page title in URL form (underscores instead of spaces).\n",
    "        output_dir: existing directory that receives the JSON file.\n",
    "\n",
    "    Returns:\n",
    "        Path of the file that was written.\n",
    "\n",
    "    Any error raised by the ``wikipedia`` package or while writing the file\n",
    "    propagates to the caller.\n",
    "    \"\"\"\n",
    "    # auto_suggest=False: look up the title exactly as given instead of\n",
    "    # letting the library substitute a search suggestion\n",
    "    page = wikipedia.page(title=raw_title.replace('_', ' '), auto_suggest=False)\n",
    "    dic = {'content': page.content,\n",
    "           'meta': {'name': page.title,\n",
    "                    'url': page.url}}\n",
    "\n",
    "    out_path = Path(output_dir) / f'{page.pageid}.json'\n",
    "    with open(out_path, 'w', encoding='utf-8') as fo:\n",
    "        json.dump(dic, fo)\n",
    "    return out_path"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# the directory has to exist before the first file is written; without this\n",
    "# every open() fails on a fresh kernel and no page is saved\n",
    "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "failed_titles = []\n",
    "for i, raw_title in enumerate(pages_titles):\n",
    "    if i % 10 == 0:\n",
    "        print(f'{i / len(pages_titles) * 100:.1f}%')\n",
    "    try:\n",
    "        download_page(raw_title)\n",
    "    except Exception:\n",
    "        # best effort: log the failure and carry on with the remaining titles\n",
    "        traceback.print_exc()\n",
    "        print(raw_title)\n",
    "        failed_titles.append(raw_title)\n",
    "\n",
    "print(f'downloaded {len(pages_titles) - len(failed_titles)} of {len(pages_titles)} pages')\n",
    "if failed_titles:\n",
    "    print('failed titles:', failed_titles)"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Archive"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "source": [
    "# pack the download directory; -C points tar at the same directory the loop wrote to\n",
    "!tar -czvf rock_wiki.tar.gz -C {OUTPUT_DIR.parent} {OUTPUT_DIR.name}"
   ],
   "metadata": {},
   "execution_count": null,
   "outputs": []
  }
 ]
}