In this lesson we learn about the basic principles of XML and ways of manipulating data in this format.
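As a minimal illustration of both points, the small sketch below defines an invented, heavily simplified XML snippet (it is not taken from the actual “Dispatch” files) and strips its markup with a regular expression; this is the core trick that both scripts in this lesson rely on.

import re
# an invented, simplified XML snippet, only for illustration
sample = '<div3 type="article"><head>Local news</head><p>Heavy rain in Richmond.</p></div3>'
# remove everything that looks like a tag, keeping only the text between the tags
clean = re.sub("<[^<]+>", "", sample)
print(clean) # prints: Local newsHeavy rain in Richmond.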
Python code (for 1a)
For our first task we write a Python script that creates a clean, markup-free copy of the text of each issue of the “Dispatch” that we scraped before.
import re
import os
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.
listOfFiles = os.listdir(source) # list all files in the source folder
os.makedirs(target, exist_ok=True) # make sure the target folder exists before writing into it

for f in listOfFiles: # looping over our files
    with open(source + f, "r", encoding="utf8") as f1:
        # read the data
        data = f1.read()
        # remove all markup from the file (issue), keeping only the text
        text = re.sub("<[^<]+>", "", data)
        # build the name of the new file in the target folder
        newFile = target + f + "_modified.xml"
        with open(newFile, "w", encoding="utf8") as f9:
            # write the cleaned text
            f9.write(text)
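If you want to verify the result, a few extra lines can report any tag-like leftovers in the cleaned files. This is only a sketch and assumes it is appended to the end of the script above, so that target is already defined; the check simply reuses the same tag pattern.

import re, os
# sanity check: list files in which tag-like strings survived the cleaning
for fileName in os.listdir(target):
    with open(target + fileName, "r", encoding="utf8") as f1:
        leftovers = re.findall("<[^<]+>", f1.read())
    if leftovers:
        print(fileName, "still contains", len(leftovers), "tag-like strings")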
Python code (for 1b)
Our second task is to write a Python script that creates clean copies of the individual articles from all issues of the “Dispatch” (wget-activehistory).
import re, os
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/" # Source path where initial xml files are.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_08/wget-activehistory_modified/" # Target folder path to save new files.
lof = os.listdir(source) # getting all files from the folder
counter = 0 # general counter to keep track of the progress
for f in lof: # looping over our files
    if f.startswith("dltext"): # process only the issue files (their names start with "dltext")
        with open(source + f, "r", encoding="utf8") as f1:
            text = f1.read()
        # try to find the date of the issue
        date = re.search(r'<date value="([\d-]+)"', text).group(1)
        # split the issue into articles/items: every item begins with a <div3 tag
        split = re.split("<div3 ", text)
        c = 0 # item counter
        for s in split[1:]: # skip the first chunk, which precedes the first item
            c += 1
            s = "<div3 " + s # restore the opening tag removed by the split
            #input(s) # (debugging) uncomment to inspect each item
            # try to find a unitType
            try:
                unitType = re.search(r'type="([^\"]+)"', s).group(1)
            except:
                unitType = "noType"
                print(s) # show the item that has no type
            # try to find a header
            try:
                header = re.search(r'<head>(.*)</head>', s).group(1)
                header = re.sub("<[^<]+>", "", header)
            except:
                header = "NO HEADER"
                print("\nNo header found!\n")
            # clean the item: strip tags, trim spaces around line breaks, collapse line breaks
            text = re.sub("<[^<]+>", "", s)
            text = re.sub(" +\n|\n +", "\n", text)
            text = re.sub("\n+", ";;; ", text)
            # generating the necessary bits
            fName = date + "_" + unitType + "_" + str(c)
            itemID = "#ID: " + date + "_" + unitType + "_" + str(c)
            dateVar = "#DATE: " + date
            unitType = "#TYPE: " + unitType
            header = "#HEADER: " + header
            text = "#TEXT: " + text
            # creating a single text variable
            var = "\n".join([itemID, dateVar, unitType, header, text])
            #input(var) # (debugging) uncomment to inspect the final record
            # opening and saving a file
            with open(target + fName + ".txt", "w", encoding="utf8") as f9:
                f9.write(var)
        # count processed issues and print the progress counter at every 100
        counter += 1
        if counter % 100 == 0:
            print(counter)
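Each resulting .txt file therefore holds one article as a small self-describing record. With invented values (the date, type, header, and text below are made up; only the field layout follows from the script), an output file such as 1862-04-15_article_3.txt would look roughly like this:

#ID: 1862-04-15_article_3
#DATE: 1862-04-15
#TYPE: article
#HEADER: LOCAL MATTERS.
#TEXT: Heavy rain fell in the city yesterday;;; the markets were quiet;;; ...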