In this lesson we prepare the data for our final mapping in QGIS.
Preparing the data and producing the final results involves the following steps:
1) Collecting all toponyms
2) Reformatting Getty Geographical Thesaurus (from XML)
3) Matching collected geographical data with Getty Gazetteer
4) Final results in QGIS
.
Solutions to Text to Map 1/1
Collecting all toponyms
import re, os
# Folder that holds the dispatch XML files read by generate() (author's local path;
# presumably the wget download from Lesson07 -- confirm).
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/"
# Folder of the current lesson. NOTE(review): not referenced by any code visible in this file.
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_10/"
# Names of all entries in the source folder, listed once when the script starts.
lof = os.listdir(source)
counter = 0 # progress counter; NOTE(review): never incremented in the code shown here
def generate(filter, source_dir=None):
    """Count the TGN identifiers used in the dispatches of one period and save them as TSV.

    Every file in the source folder whose name starts with "dltext" is read.
    A file is counted only when its first <date value="..."> attribute starts
    with ``filter``. The result is written to "dispatch_toponyms_<filter>.tsv"
    in the current working directory, with a "freq<TAB>tgn" header and one
    row per TGN number (frequency zero-padded to nine digits).

    Args:
        filter: Date prefix to select, e.g. "1861" for one year or "186" for
            the whole decade. (The name shadows the builtin but is kept so
            existing calls keep working.)
        source_dir: Folder with the dispatch files. Defaults to the
            module-level ``source`` path.

    Returns:
        None. The counts are written to disk.
    """
    if source_dir is None:
        source_dir = source  # fall back to the folder configured at the top of the script

    print(filter)  # show which period is being processed
    topCountDictionary = {}  # TGN number -> number of occurrences

    for f in os.listdir(source_dir):
        if not f.startswith("dltext"):  # only the dispatch files are relevant
            continue
        with open(os.path.join(source_dir, f), "r", encoding="utf8") as f1:
            text = f1.read()
        # Decode the ampersand entity. The previous call replaced "&" with
        # itself, which did nothing.
        text = text.replace("&amp;", "&")

        # A file without a date cannot be assigned to a period, so it is
        # skipped instead of crashing on a failed search.
        date = re.search(r'<date value="([\d-]+)"', text)
        if date is None or not date.group(1).startswith(filter):
            continue

        # Each reference looks like "tgn,<number>"; only the number is kept.
        for tgn in re.findall(r"tgn,(\d+)", text):
            topCountDictionary[tgn] = topCountDictionary.get(tgn, 0) + 1

    # One "frequency<TAB>tgn" row per place.
    top_TSV = ["%09d\t%s" % (v, k) for k, v in topCountDictionary.items()]

    header = "freq\ttgn\n"
    with open("dispatch_toponyms_%s.tsv" % filter, "w", encoding="utf8") as f9:
        f9.write(header + "\n".join(top_TSV))
#print(counter)
# Build one toponym table per year. generate("186") would instead put the
# whole decade into a single file, because the filter is a date prefix.
for year in ("1861", "1862", "1863", "1864", "1865"):
    generate(year)
Results:
Reformatting Getty Geographical Thesaurus (from XML)
import re, os
# Folder with the Getty TGN XML files passed to generateTGNdata() (author's local path).
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_10/tgn_xml_0619/"
def _tgn_decimal(record, tag):
    """Return the <Decimal> value inside the first <tag> element of record, or "NA"."""
    # Non-greedy so that a second element of the same name later in the
    # record is not swallowed into the capture.
    group = re.search(r"<%s>(.*?)</%s>" % (tag, tag), record, re.DOTALL)
    if group is None:
        return "NA"
    decimal = re.search(r"<Decimal>(.*?)</Decimal>", group.group(1))
    return decimal.group(1) if decimal else "NA"


def generateTGNdata(source):
    """Reduce the Getty TGN XML files to light TSV tables.

    Every file in ``source`` whose name starts with "TGN" is read and split
    into <Subject> records. For each record the Subject_ID, the first
    <Term_Text> and the decimal latitude/longitude are extracted ("NA" when
    the record has no coordinates).

    Two files are written to the current working directory:
      * "tgn_data_light.tsv"    -- all places, including those without coordinates
      * "tgn_data_light_NA.tsv" -- only the places without coordinates

    Args:
        source: Folder that contains the TGN XML files.

    Returns:
        None. The tables are written to disk and a summary is printed.
    """
    tgnList = []    # all places
    tgnListNA = []  # places whose latitude is "NA"

    for f in os.listdir(source):
        if not f.startswith("TGN"):  # skip anything that is not a TGN data file
            continue
        print(f)  # progress: which file is being processed
        with open(os.path.join(source, f), "r", encoding="utf8") as f1:
            data = f1.read()

        # Each chunk ends where a Subject element closes.
        for d in data.split("</Subject>"):
            d = re.sub("\n +", "", d)  # drop line breaks together with the indentation after them

            idMatch = re.search(r"Subject_ID=\"(\d+)\"", d)
            if idMatch is None:  # header/footer chunk without a subject
                continue
            placeID = idMatch.group(1)

            nameMatch = re.search(r"<Term_Text>([^<]+)</Term_Text>", d)
            if nameMatch is None:
                # Without a name the row would be unusable; report and move on
                # instead of crashing.
                print("\tno Term_Text for Subject_ID %s, skipped" % placeID)
                continue
            placeName = nameMatch.group(1)

            if "<Coordinates>" in d:
                lat = _tgn_decimal(d, "Latitude")
                lon = _tgn_decimal(d, "Longitude")
            else:
                lat = "NA"
                lon = "NA"

            row = [placeID, placeName, lat, lon]
            tgnList.append("\t".join(row))
            if lat == "NA":
                # Places without coordinates are reported and kept in a
                # separate list; they are not used for the final map.
                print("\t" + "; ".join(row))
                tgnListNA.append("\t".join(row))

    # saving the files
    header = "tgnID\tplacename\tlat\tlon\n"
    with open("tgn_data_light.tsv", "w", encoding="utf8") as f9a:
        f9a.write(header + "\n".join(tgnList))
    with open("tgn_data_light_NA.tsv", "w", encoding="utf8") as f9b:
        f9b.write(header + "\n".join(tgnListNA))
    print("TGN has %d items" % len(tgnList))
# Convert the TGN XML files found in `source` into the two TSV tables.
generateTGNdata(source)
# Result noted by the author: TGN has 2,487,572 items,
# 17,613 of which do not have coordinates.
Results:
Matching collected geographical data with Getty Gazetteer
import re, os
# loading TGN data into a dictionary (with a function):
def loadTGN(tgnTSV):
    """Load a tab-separated TGN table into a dictionary.

    Args:
        tgnTSV: Path of the TSV file to read (UTF-8).

    Returns:
        A dict that maps the first column of every non-empty line to the
        full list of that line's columns. The header line, if the file has
        one, is stored like any other row under its first column name.
    """
    dic = {}
    with open(tgnTSV, "r", encoding="utf8") as f1:
        # Stream line by line; the full gazetteer is large.
        for line in f1:
            line = line.rstrip("\n")
            if not line:  # a blank or trailing line would create a bogus "" key
                continue
            d = line.split("\t")
            dic[d[0]] = d
    return dic
# matching our data-files with the tgn-data:
def match(freqFile, dicToMatch):
    """Attach gazetteer data to every toponym of a frequency table.

    The frequency file is expected to have a header line followed by
    "freq<TAB>tgn" rows. Each TGN number found in ``dicToMatch`` produces a
    row "tgnID, placename, lat, lon, freq". Rows with coordinates go to
    "coord_<name>", rows whose latitude or longitude is "NA" go to
    "coord_NA_<name>". Both files are written next to ``freqFile``.
    Numbers that are not in the dictionary are printed and counted.

    Args:
        freqFile: Path of the frequency TSV.
        dicToMatch: Mapping of TGN number to [tgnID, placename, lat, lon].

    Returns:
        None. The results are written to disk and a summary is printed.
    """
    with open(freqFile, "r", encoding="utf8") as f1:
        data = f1.read().split("\n")

    dataNew = []    # rows with coordinates
    dataNewNA = []  # rows without coordinates
    count = 0       # number of TGN numbers missing from the dictionary

    for d in data[1:]:  # data[0] is the header
        fields = d.split("\t")
        if len(fields) < 2:  # blank or trailing line: nothing to match
            continue
        freq, tgnID = fields[0], fields[1]

        record = dicToMatch.get(tgnID)
        if record is None:
            print("%s (%d) not in TGN!" % (tgnID, int(freq)))
            count += 1
            continue

        val = "\t".join(record) + "\t" + freq
        # Look at the coordinate columns only, so that a place that is
        # literally named "NA" is not mistaken for a missing coordinate.
        if "NA" in record[2:4]:
            dataNewNA.append(val)
        else:
            dataNew.append(val)

    header = "tgnID\tplacename\tlat\tlon\tfreq\n"
    # Prefix the file name, not the whole path, so that freqFile may include a folder.
    folder, name = os.path.split(freqFile)
    with open(os.path.join(folder, "coord_" + name), "w", encoding="utf8") as f9a:
        f9a.write(header + "\n".join(dataNew))
    with open(os.path.join(folder, "coord_NA_" + name), "w", encoding="utf8") as f9b:
        f9b.write(header + "\n".join(dataNewNA))
    print("%d item have not been matched..." % count)
# Load the reformatted Getty gazetteer once and reuse it for every year.
dictionary = loadTGN("tgn_data_light.tsv")
# Match each yearly toponym table against the gazetteer.
for year in ("1861", "1862", "1863", "1864", "1865"):
    match("dispatch_toponyms_%s.tsv" % year, dictionary)
Results:
Please find my old version of the Python code with explanation:
For a much better view of all the steps, please follow this link: Lesson10!