lawsoc-scraper/convert_json_address_space.py

65 lines
2.0 KiB
Python

#!/usr/bin/python3
# Clean up address data in JSON files by stripping leading and trailing whitespace from each line of the address.
import os
import json
# not currently used
def convert_year(date):
    """Convert a ``d/m/y`` date string to ``year/month/day`` order.

    Years of one or two digits are expanded with a pivot at 50: values of
    50 and above become 19xx, values below 50 become 20xx.  A year that
    already has more than two digits is kept as given instead of being
    prefixed with a century.  Day and month are passed through unchanged.

    Args:
        date: Text with exactly three '/'-separated fields: day, month, year.

    Returns:
        The date rearranged as "year/month/day".

    Raises:
        ValueError: If ``date`` does not split into exactly three fields or
            the year field is not an integer.
    """
    d, m, y = date.split('/')
    y = y.strip()
    year = int(y)
    if len(y) > 2:
        # Already a full year; prefixing a century would yield e.g. "191999".
        full_year = y
    else:
        # Zero-pad so a single-digit year such as "5" becomes "2005", not "205".
        full_year = '%d%02d' % (19 if year >= 50 else 20, year)
    return "/".join([full_year, m, d])
def write_json_file(workspace, file, data):
    """Write ``data`` to a file as indented JSON with sorted keys.

    The target path, the raw data and the rendered JSON text are echoed to
    stdout before the file is written.

    Args:
        workspace: Directory that receives the file; a trailing path
            separator is optional.
        file: Name of the file inside ``workspace``.
        data: JSON-serializable object to write.

    Raises:
        TypeError: If ``data`` cannot be serialized to JSON.
        OSError: If the target file cannot be opened or written.
    """
    # os.path.join copes with a workspace lacking a trailing separator,
    # where plain concatenation would glue directory and file name together.
    file = os.path.join(workspace, file)
    print(file)
    print(data)
    # Serialize before opening: opening in "w" mode truncates the target, so
    # a serialization error raised afterwards would leave an empty file.
    json_contents = json.dumps(data, sort_keys=True, indent=4)
    print(json_contents)
    with open(file, "w", encoding="utf-8") as jsonFile:
        jsonFile.write(json_contents)
def documents(workspace, fileList):
    """Load each file's JSON content, yielding the file name with its contents.

    Progress messages are printed to stdout as the generator is consumed.

    Args:
        workspace: Directory containing the files; a trailing path
            separator is optional.
        fileList: Names of the files inside ``workspace`` to load.

    Yields:
        ``(file, contents)`` tuples, where ``file`` is the name taken from
        ``fileList`` and ``contents`` is the parsed JSON of that file.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a file does not contain valid JSON.
    """
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        # Close the handle before yielding so the consumer can safely
        # rewrite the same file while the generator is suspended.
        with open(os.path.join(workspace, file)) as json_file:
            contents = json.load(json_file)
        yield file, contents
# Alternative workspace root, currently disabled: '/var/tmp/lawsoc_new/'
# All paths are rooted at the directory the script is launched from.
workspace_home = os.getcwd()
# "json_out" subdirectory of the workspace root (trailing slash included).
workspace = "".join([workspace_home, "/json_out/"])
# "office" subdirectory of json_out (trailing slash included).
office_workspace = "".join([workspace, "office/"])
if __name__ == '__main__':
    # Rewrite every file in the office workspace in place, with surrounding
    # whitespace stripped from each entry of its "address" field.  Only
    # "address" is modified; all other fields are written back as loaded.
    office_fileList = os.listdir(office_workspace)
    for file, data in documents(office_workspace, office_fileList):
        print("File: ", file)
        address = data["address"]
        print(type(address))
        # Strip once and reuse the result for both the log line and the update.
        new_address = list(map(str.strip, address))
        print(new_address)
        data["address"] = new_address
        write_json_file(office_workspace, file, data)