In this lesson we learn about the basic principles of XML and ways of manipulating data in this format.
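As a minimal illustration of both points, the small sketch below defines an invented, heavily simplified XML snippet (it is not taken from the actual “Dispatch” files) and strips its markup with a regular expression; this is the core trick that both scripts in this lesson rely on.

import re
# an invented, simplified XML snippet, only for illustration
sample = '<div3 type="article"><head>Local news</head><p>Heavy rain in Richmond.</p></div3>'
# remove everything that looks like a tag, keeping only the text between the tags
clean = re.sub("<[^<]+>", "", sample)
print(clean) # prints: Local newsHeavy rain in Richmond.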
Python code (for 1a)
For our first task we write a Python script that creates a clean, markup-free copy of the text of each issue of the “Dispatch” that we scraped before.
import re
import os
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.
listOfFiles = os.listdir(source) # list all files in the source folder
os.makedirs(target, exist_ok=True) # make sure the target folder exists before writing into it

for f in listOfFiles: # looping over our files
    with open(source + f, "r", encoding="utf8") as f1:
        # read the data
        data = f1.read()
        # remove all markup from the file (issue), keeping only the text
        text = re.sub("<[^<]+>", "", data)
        # build the name of the new file in the target folder
        newFile = target + f + "_modified.xml"
        with open(newFile, "w", encoding="utf8") as f9:
            # write the cleaned text
            f9.write(text)
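If you want to verify the result, a few extra lines can report any tag-like leftovers in the cleaned files. This is only a sketch and assumes it is appended to the end of the script above, so that target is already defined; the check simply reuses the same tag pattern.

import re, os
# sanity check: list files in which tag-like strings survived the cleaning
for fileName in os.listdir(target):
    with open(target + fileName, "r", encoding="utf8") as f1:
        leftovers = re.findall("<[^<]+>", f1.read())
    if leftovers:
        print(fileName, "still contains", len(leftovers), "tag-like strings")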
Python code (for 1b)
Our second task is to write a Python script that creates clean copies of the individual articles from all issues of the “Dispatch” (wget-activehistory).
import re, os
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.
lof = os.listdir(source) # getting all files from the folder
counter = 0 # general counter to keep track of the progress
for f in lof: # looping over our files
    if f.startswith("dltext"): # process only the issue files (their names start with "dltext")
        with open(source + f, "r", encoding="utf8") as f1:
            text = f1.read()
        # try to find the date of the issue
        date = re.search(r'<date value="([\d-]+)"', text).group(1)
        # split the issue into articles/items: every item begins with a <div3 tag
        split = re.split("<div3 ", text)
        c = 0 # item counter
        for s in split[1:]: # skip the first chunk, which precedes the first item
            c += 1
            s = "<div3 " + s # restore the opening tag removed by the split
            #input(s) # (debugging) uncomment to inspect each item
            # try to find a unitType
            try:
                unitType = re.search(r'type="([^\"]+)"', s).group(1)
            except:
                unitType = "noType"
                print(s) # show the item that has no type
            # try to find a header
            try:
                header = re.search(r'<head>(.*)</head>', s).group(1)
                header = re.sub("<[^<]+>", "", header)
            except:
                header = "NO HEADER"
                print("\nNo header found!\n")
            # clean the item: strip tags, trim spaces around line breaks, collapse line breaks
            text = re.sub("<[^<]+>", "", s)
            text = re.sub(" +\n|\n +", "\n", text)
            text = re.sub("\n+", ";;; ", text)
            # generating the necessary bits
            fName = date + "_" + unitType + "_" + str(c)
            itemID = "#ID: " + date + "_" + unitType + "_" + str(c)
            dateVar = "#DATE: " + date
            unitType = "#TYPE: " + unitType
            header = "#HEADER: " + header
            text = "#TEXT: " + text
            # creating a single text variable
            var = "\n".join([itemID, dateVar, unitType, header, text])
            #input(var) # (debugging) uncomment to inspect the final record
            # opening and saving a file
            with open(target + fName + ".txt", "w", encoding="utf8") as f9:
                f9.write(var)
        # count processed issues and print the progress counter at every 100
        counter += 1
        if counter % 100 == 0:
            print(counter)
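Each resulting .txt file therefore holds one article as a small self-describing record. With invented values (the date, type, header, and text below are made up; only the field layout follows from the script), an output file such as 1862-04-15_article_3.txt would look roughly like this:

#ID: 1862-04-15_article_3
#DATE: 1862-04-15
#TYPE: article
#HEADER: LOCAL MATTERS.
#TEXT: Heavy rain fell in the city yesterday;;; the markets were quiet;;; ...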