In this lesson we learn the basic principles of XML and ways of manipulating data in this format.

Python code (for 1a)

For our first task, we write a Python script that creates clean (markup-free) copies of the text of each issue of the “Dispatch” that we scraped earlier.

import re
import os

# Script 1a: for every file in `source`, write a copy with all XML markup
# removed into `target`, named "<original name>_modified.xml".
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.

# open(..., "w") does not create missing directories, so make sure the
# target folder exists before the first file is written.
os.makedirs(target, exist_ok=True)

listOfFiles = os.listdir(source)

for f in listOfFiles: # looping our files
    sourcePath = os.path.join(source, f)
    # os.listdir() also returns sub-folders; those cannot be opened as files.
    if not os.path.isfile(sourcePath):
        continue

    # read the whole issue; the input file is closed before we write
    with open(sourcePath, "r", encoding="utf8") as f1:
        data = f1.read()

    # remove markup: drop everything that looks like a tag, keep the text
    text = re.sub("<[^<]+>", "", data)

    # the clean copy keeps the original file name plus a "_modified.xml" suffix
    newFile = os.path.join(target, f + "_modified.xml")
    with open(newFile, "w", encoding="utf8") as f9:
        f9.write(text)

Python code (for 1b)

Our second task is to write a Python script that creates clean copies of the individual articles from all issues of the “Dispatch” (wget-activehistory).

import re, os

# Script 1b: split every "dltext*" issue in `source` into its <div3> items
# and save each item as a small tagged text file
# (#ID / #DATE / #TYPE / #HEADER / #TEXT) in `target`.
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.

# open(..., "w") does not create missing directories, so make sure the
# target folder exists before the first item is written.
os.makedirs(target, exist_ok=True)

lof = os.listdir(source) # getting all files from folder
counter = 0 # general counter to keep track of the progress

for f in lof: # looping our files
    if f.startswith("dltext"): # fileName test
        with open(source + f, "r", encoding="utf8") as f1:
            text = f1.read()

        # try to find the date; re.search() returns None when there is no
        # match, so check before calling .group() instead of crashing
        dateMatch = re.search(r'<date value="([\d-]+)"', text)
        if dateMatch is None:
            print("\nNo date found in " + f + ", skipping this file!\n")
            continue
        date = dateMatch.group(1)

        # splitting the issue into articles/items; the first chunk is
        # everything before the first <div3> and is not an item
        split = re.split("<div3 ", text)

        # c is the 1-based item counter within the current issue
        for c, s in enumerate(split[1:], 1):
            s = "<div3 " + s # a step to restore the integrity of items

            # try to find a unitType (first type="..." attribute in the item)
            typeMatch = re.search(r'type="([^\"]+)"', s)
            if typeMatch is not None:
                unitType = typeMatch.group(1)
            else:
                unitType = "noType"
                print(s) # show the item so the missing type can be inspected

            # try to find a header and strip any markup nested inside it
            headMatch = re.search(r'<head>(.*)</head>', s)
            if headMatch is not None:
                header = re.sub("<[^<]+>", "", headMatch.group(1))
            else:
                header = "NO HEADER"
                print("\nNo header found!\n")

            # clean the item: remove tags, trim spaces around line breaks,
            # then flatten the item to one line with ";;; " as line separator
            itemText = re.sub("<[^<]+>", "", s)
            itemText = re.sub(" +\n|\n +", "\n", itemText)
            itemText = re.sub("\n+", ";;; ", itemText)

            # generating necessary bits; the file name doubles as the item ID
            fName = date + "_" + unitType + "_" + str(c)

            var = "\n".join([
                "#ID: " + fName,
                "#DATE: " + date,
                "#TYPE: " + unitType,
                "#HEADER: " + header,
                "#TEXT: " + itemText,
            ])

            # opening and saving a file
            with open(target + fName + ".txt", "w", encoding="utf8") as f9:
                f9.write(var)

        # count processed issues and print progress counter at every 100
        counter += 1
        if counter % 100 == 0:
            print(counter)