In this lesson we should reformatte Dispatch (wget-activehistory) file in order to generate one TSV - file in which each article is a single record with following content:
- date;
- type of entry (articles, notices etc);
- header;
- the text of entry.
Homework Solution
import re, os
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/"
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_09/"
lof = os.listdir(source)
counter = 0 # general counter to keep track of the progress
ourCSV = [] # the list to hold the whole dictionaries
for f in lof:
if f.startswith("dltext"): # fileName test
with open(source + f, "r", encoding="utf8") as f1:
text = f1.read()
# try to find the date
date = re.search(r'<date value="([\d-]+)"', text).group(1)
# splitting the issue into articles/items
split = re.split("<div3 ", text)
c = 0 # item counter
for s in split[1:]:
c += 1
s = "<div3 " + s # a step to restore the integrity of items
#input(s)
# try to find a unitType
try:
unitType = re.search(r'type="([^\"]+)"', s).group(1)
except:
unitType = "noType"
print(s)
# try to find a header, if not write "NO HEADER"(should not be in our list)
try:
header = re.search(r'<head>(.*)</head>', s).group(1)
header = re.sub("<[^<]+>", "", header)
except:
header = "NO HEADER"
#print("No header found!")
# to clean our file from all tags (which do not need).
text = re.sub("<[^<]+>", "", s)
text = re.sub(" +\n|\n +", "\n", text)
text = re.sub("\n+", ";;; ", text)
# generating necessary bits
fName = date+"_"+unitType+"_"+str(c)
# creating all the needed entries
itemID = date+"_"+unitType+"_"+str(c)
dateVar = date
#unitType = unitType
#header = header
text = text.replace("\t", " ") # text replace
# creating a text variable
var = "\t".join([itemID,dateVar,unitType,header,text])
#input(var)
ourCSV.append(var)
# count processed issues and print progress counter at every 100
counter += 1
if counter % 100 == 0:
print(counter)
# saving
with open("dispatch_as_TSV.csv", "w", encoding="utf8") as f9:
f9.write("\n".join(ourCSV))
print(counter)
Results:
Please find below my old version of the Python code with explanation:
For much better view all steps please follow this link: Lesson09!