In this lesson we prepare the data for our final mapping in QGIS.

The preparation of the data and the final results involve the following steps: 1) Collecting all toponyms; 2) Reformatting the Getty Geographical Thesaurus (from XML); 3) Matching the collected geographical data with the Getty Gazetteer; 4) Final results in QGIS.

Solutions to Text to Map 1/1

Collecting all toponyms

import re, os

# Input folder: the dispatch files whose names are listed into `lof` below.
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson07/wget-activehistory/"
# Lesson 10 folder -- presumably the intended output location (TODO confirm).
target = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_10/"


lof = os.listdir(source)  # names of all entries in the input folder, listed once at start-up
counter = 0 # general counter to keep track of the progress

def generate(filter, source_dir=None, out_dir=None):
    """Count the TGN ids used in the dispatches of one period and save a TSV.

    Every file in the source folder whose name starts with ``dltext`` is read.
    Files whose ``<date value="...">`` starts with *filter* contribute their
    ``tgn,<number>`` references to a frequency table, which is written to
    ``dispatch_toponyms_<filter>.tsv`` with the columns ``freq`` and ``tgn``.

    Args:
        filter: Date prefix to select, e.g. ``"1861"`` for a single year.
            (The name shadows the builtin; it is kept so existing calls that
            pass it by keyword keep working.)
        source_dir: Folder holding the dispatch files. Defaults to the
            module-level ``source``.
        out_dir: Folder for the TSV file. Defaults to the current working
            directory.
    """
    from collections import Counter

    folder = source if source_dir is None else source_dir
    date_pattern = re.compile(r'<date value="([\d-]+)"')
    tgn_pattern = re.compile(r"tgn,(\d+)")  # capture only the numeric id
    frequencies = Counter()

    print(filter)  # show which period is being processed
    # Sorted so the row order of the output is reproducible between runs.
    for name in sorted(os.listdir(folder)):
        if not name.startswith("dltext"):
            continue
        with open(os.path.join(folder, name), "r", encoding="utf8") as handle:
            text = handle.read()

        # The original replaced "&" with itself (a no-op), presumably a
        # mangled "&amp;"; neither regex below depends on ampersands.
        text = text.replace("&amp;", "&")

        # A file without a date cannot be assigned to a period: skip it
        # instead of failing on a missing regex match.
        found = date_pattern.search(text)
        if found is None or not found.group(1).startswith(filter):
            continue

        frequencies.update(tgn_pattern.findall(text))

    # Zero-padded frequency first, so the rows sort correctly as plain text.
    rows = ["%09d\t%s" % (count, tgn) for tgn, count in frequencies.items()]

    header = "freq\ttgn\n"
    file_name = "dispatch_toponyms_%s.tsv" % filter
    out_path = file_name if out_dir is None else os.path.join(out_dir, file_name)
    with open(out_path, "w", encoding="utf8") as f9:
        f9.write(header + "\n".join(rows))

# The filter is a date prefix, so generate("186") would pool the whole decade;
# here one file per year is produced instead.
for year in ("1861", "1862", "1863", "1864", "1865"):
    generate(year)

Results:

Results10

Reformatting Getty Geographical Thesaurus (from XML)

import re, os

# Folder that generateTGNdata() scans for the TGN XML files.
source = "C:/Users/Tatjana Smiljnic/Desktop/univie-tnt-2019.github.io/Lesson_10/tgn_xml_0619/"

_SUBJECT_ID_RE = re.compile(r"Subject_ID=\"(\d+)\"")
_TERM_TEXT_RE = re.compile(r"<Term_Text>([^<]+)</Term_Text>")
# Non-greedy on purpose: a greedy ".*" would run from the first opening tag to
# the LAST closing tag on the line and swallow everything in between.
_LATITUDE_RE = re.compile(r"<Latitude>(.*?)</Latitude>")
_LONGITUDE_RE = re.compile(r"<Longitude>(.*?)</Longitude>")
_DECIMAL_RE = re.compile(r"<Decimal>(.*?)</Decimal>")


def _first_decimal(axis_re, chunk):
    """Return the decimal value of the first element matched by *axis_re*, or None."""
    axis = axis_re.search(chunk)
    if axis is None:
        return None
    decimal = _DECIMAL_RE.search(axis.group(1))
    return None if decimal is None else decimal.group(1)


def _parse_subject(chunk):
    """Return [placeID, placeName, lat, lon] for one <Subject> chunk, or None."""
    # Drop line breaks followed by indentation so each element sits on one line.
    chunk = re.sub("\n +", "", chunk)

    subject = _SUBJECT_ID_RE.search(chunk)
    if subject is None:  # e.g. the trailer after the last </Subject>
        return None

    name = _TERM_TEXT_RE.search(chunk)
    place_name = "NA" if name is None else name.group(1)

    lat = lon = None
    if "<Coordinates>" in chunk:
        lat = _first_decimal(_LATITUDE_RE, chunk)
        lon = _first_decimal(_LONGITUDE_RE, chunk)
    if lat is None or lon is None:
        # Missing or incomplete coordinates are marked "NA" in both columns.
        lat = lon = "NA"

    return [subject.group(1), place_name, lat, lon]


def generateTGNdata(source, out_dir=None):
    """Condense the Getty TGN XML files into two light-weight TSV files.

    Every file in *source* whose name starts with ``TGN`` is split on
    ``</Subject>``. For each subject the id, the first ``<Term_Text>`` and the
    decimal latitude/longitude are extracted. All places are written to
    ``tgn_data_light.tsv``; places without coordinates are additionally
    printed and written to ``tgn_data_light_NA.tsv``.

    Args:
        source: Folder containing the TGN XML files.
        out_dir: Folder for the two TSV files. Defaults to the current
            working directory.
    """
    tgnList = []
    tgnListNA = []

    # Sorted so the row order of the output is reproducible between runs.
    for f in sorted(os.listdir(source)):
        if not f.startswith("TGN"):
            continue
        print(f)
        with open(os.path.join(source, f), "r", encoding="utf8") as f1:
            data = f1.read()

        for chunk in data.split("</Subject>"):
            record = _parse_subject(chunk)
            if record is None:
                continue

            tgnList.append("\t".join(record))
            if record[2] == "NA":
                # Report places without coordinates; they are not mapped later.
                print("\t" + "; ".join(record))
                tgnListNA.append("\t".join(record))

    header = "tgnID\tplacename\tlat\tlon\n"
    folder = "" if out_dir is None else out_dir

    with open(os.path.join(folder, "tgn_data_light.tsv"), "w", encoding="utf8") as f9a:
        f9a.write(header + "\n".join(tgnList))

    with open(os.path.join(folder, "tgn_data_light_NA.tsv"), "w", encoding="utf8") as f9b:
        f9b.write(header + "\n".join(tgnListNA))

    print("TGN has %d items" % len(tgnList))

generateTGNdata(source)

# Figures noted by the author after running this on the full TGN export:
#   TGN has 2,487,572 items
#   17,613 items do not have coordinates.

Results:

Results10

Results10

Matching collected geographical data with Getty Gazetteer

import re, os
# loading TGN data into a dictionary (with a function):
def loadTGN(tgnTSV):
    """Load the condensed TGN table into a dictionary keyed by TGN id.

    Args:
        tgnTSV: Path of a tab-separated file whose first column is the TGN id
            (``tgn_data_light.tsv`` as written by ``generateTGNdata``).

    Returns:
        dict mapping each TGN id (str) to the list of all fields of its row,
        the id itself included. Blank lines and the ``tgnID`` header row are
        skipped so they cannot appear as fake places.
    """
    dic = {}
    with open(tgnTSV, "r", encoding="utf8") as f1:
        for line in f1:
            line = line.rstrip("\n")
            if not line.strip():
                continue  # a trailing newline would otherwise add a '' key
            fields = line.split("\t")
            if fields[0] == "tgnID":
                continue  # column header, not a place
            dic[fields[0]] = fields
    return dic
# matching our data-files with the tgn-data:
def match(freqFile, dicToMatch):
    """Attach TGN names and coordinates to a toponym frequency file.

    Reads *freqFile* (columns ``freq`` and ``tgn``, first line is the header),
    looks each TGN id up in *dicToMatch* and writes two files next to the
    input: ``coord_<name>`` for places with coordinates and ``coord_NA_<name>``
    for places whose latitude or longitude is ``NA``. Ids missing from
    *dicToMatch* are printed and counted.

    Args:
        freqFile: Path of the frequency TSV written by ``generate``.
        dicToMatch: Mapping of TGN id to ``[tgnID, placename, lat, lon]``,
            as returned by ``loadTGN``.
    """
    with open(freqFile, "r", encoding="utf8") as f1:
        lines = f1.read().split("\n")

    dataNew = []
    dataNewNA = []
    count = 0  # number of ids that could not be matched

    for line in lines[1:]:  # skip the header row
        fields = line.split("\t")
        if len(fields) < 2:
            # Blank line (trailing newline, or a year without any toponyms).
            continue
        freq, tgnID = fields[0], fields[1]

        if tgnID not in dicToMatch:
            print("%s (%d) not in TGN!" % (tgnID, int(freq)))
            count += 1
            continue

        entry = dicToMatch[tgnID]
        row = "\t".join(entry) + "\t" + freq  # append the frequency column

        # Look at the coordinate columns only, so that a place that is
        # literally named "NA" is not mistaken for one without coordinates.
        if "NA" in entry[2:4]:
            dataNewNA.append(row)
        else:
            dataNew.append(row)

    header = "tgnID\tplacename\tlat\tlon\tfreq\n"
    # Prefix the file name, not the whole path, so freqFile may sit in a folder.
    folder, base = os.path.split(freqFile)

    with open(os.path.join(folder, "coord_" + base), "w", encoding="utf8") as f9a:
        f9a.write(header + "\n".join(dataNew))

    with open(os.path.join(folder, "coord_NA_" + base), "w", encoding="utf8") as f9b:
        f9b.write(header + "\n".join(dataNewNA))

    print("%d item have not been matched..." % count)

# Load the cleaned-up Getty gazetteer once and reuse it for every year.
dictionary = loadTGN("tgn_data_light.tsv")

# Match each yearly toponym file against the gazetteer.
for year in ("1861", "1862", "1863", "1864", "1865"):
    match("dispatch_toponyms_%s.tsv" % year, dictionary)

Results:

Results10

Results10

Please find my old version of the Python code with explanation:

Lesson10

For a much better view of all the steps, please follow this link: Lesson10!