"""scan.py - condense the raw 2020 Census redistricting files.

Reads the raw data files downloaded from the Census Bureau and writes text
files that are easier to work with.  Output is intended to be concise while
containing useful data about demographics and geography at various levels
of detail, from states down to blocks.  The program is written in Python 3
and takes less than 10 minutes to run.

A line in a typical output file includes:
    name of area; total population, white, black, Hispanic, Asian;
    latitude and longitude; area in acres

The largest output files by far are block.txt and block0.txt.  The
difference is that block0.txt also includes blocks of zero population.
Output files with "2" in their names count only the voting age (18+)
population.

The program assumes a subdirectory "raw" that contains the raw data files.
Puerto Rico can be included by adding the string "pr" to stateList.

As of April 2022, the Census Bureau's data and documentation are here:
https://www.census.gov/programs-surveys/decennial-census/about/rdo/summary-files.html
"""

import contextlib
import os
import re

stateList = [
    "al", "ak", "az", "ar", "ca", "co", "ct", "de", "dc", "fl",
    "ga", "hi", "id", "il", "in", "ia", "ks", "ky", "la",
    "me", "md", "ma", "mi", "mn", "ms", "mo", "mt", "ne",
    "nv", "nh", "nj", "nm", "ny", "nc", "nd", "oh", "ok",
    "or", "pa", "ri", "sc", "sd", "tn", "tx", "ut", "vt",
    "va", "wa", "wv", "wi", "wy",
]

# The input files delimit fields with "|".  Empty fields are kept so that
# positional indexes stay valid.  Compiled once instead of once per line.
FIELD_SEPARATOR = re.compile(r"\|")

# Positions of the fields used from a geographic ("geo") record.
GEO_STATE_ABBR = 1
GEO_SUMMARY_LEVEL = 2
GEO_STATE = 12
GEO_COUNTY = 14
GEO_TRACT = 32
GEO_BLOCK_GROUP = 33
GEO_BLOCK = 34
GEO_LAND_AREA = 84  # in square meters
GEO_NAME = 87
GEO_LATITUDE = 92
GEO_LONGITUDE = 93

# Positions of the counts read from a data record, listed in output order:
# total, white, black, Hispanic, Asian.
POPULATION_COLUMNS = (76, 80, 81, 77, 83)

COUNTY_LEVEL = "050"
BLOCK_LEVEL = "750"

COUNTY_NAME_WIDTH = 20
# The map program skips the first 30 characters of a line, so area names
# are kept to exactly 26 characters.
AREA_NAME_WIDTH = 26

# summary level -> (output file stem, leading columns, width of the total,
#                   width of the other four counts, append lat/lon/acres?)
# Every level except the block level also gets a "<stem>2.txt" file holding
# the counts from the second (18+) data file.  The block level instead
# writes block0.txt (all blocks) and block.txt (populated blocks only).
LAYOUTS = {
    "040": ("state", ("abbr",), 8, 8, False),
    "050": ("county", ("fips", "abbr", "county"), 8, 7, True),
    "060": ("sub", ("fips", "abbr", "county", "name"), 8, 7, True),
    "500": ("cong", ("abbr", "name"), 7, 7, True),
    "610": ("upper", ("abbr", "name"), 7, 7, True),
    "620": ("lower", ("abbr", "name"), 7, 7, True),
    "635": ("vote", ("abbr", "county", "name"), 7, 7, True),
    "160": ("place", ("abbr", "name"), 7, 7, True),
    "140": ("tract", ("fips", "abbr", "county", "tract"), 5, 5, True),
    "150": ("bg", ("fips", "abbr", "county", "tract", "bg"), 5, 5, True),
    "750": ("block", ("fips", "abbr", "county", "tract", "block"), 5, 5, True),
}


def _split_geo_record(raw_line):
    """Decode one line of a geo file (read as bytes) and split it into fields.

    The geo files are read in binary because names may contain non-ASCII
    letters (e.g. the Spanish n with tilde).  Calling str() on the bytes
    would produce the "b'...'" repr with escaped characters, so the line is
    decoded instead: UTF-8 first, Latin-1 (which accepts any byte) otherwise.
    """
    try:
        text = raw_line.decode("utf-8")
    except UnicodeDecodeError:
        text = raw_line.decode("latin-1")
    return FIELD_SEPARATOR.split(text.rstrip("\r\n"))


def _population_counts(data_line):
    """Return [total, white, black, Hispanic, Asian] from one data line."""
    tokens = FIELD_SEPARATOR.split(data_line)
    return [tokens[position] for position in POPULATION_COLUMNS]


def _acres(land_square_meters):
    """Convert a land-area field in square meters to whole acres, as text.

    A field that is not an integer (e.g. empty) counts as 0 acres.
    """
    try:
        return str(int(int(land_square_meters) * 640.0 / 2589988.0))
    except ValueError:
        return "0"


def _format_line(prefix, counts, total_width, count_width, location):
    """Build one output line: prefix, right-justified counts, location."""
    total, *others = counts
    columns = list(prefix)
    columns.append(total.rjust(total_width))
    columns.extend(count.rjust(count_width) for count in others)
    columns.extend(location)
    return " ".join(columns) + "\n"


def _open_outputs(stack, out_dir):
    """Open (and truncate) every output file; return them keyed by stem.

    The files are registered on *stack* so they are closed even when the
    scan fails part-way.  UTF-8 is used because area names may contain
    non-ASCII characters.
    """
    stems = []
    for stem, *_ in LAYOUTS.values():
        if stem == "block":
            stems.extend(["block", "block0"])
        else:
            stems.extend([stem, stem + "2"])
    return {
        stem: stack.enter_context(
            open(os.path.join(out_dir, stem + ".txt"), "w", encoding="utf-8")
        )
        for stem in stems
    }


def _read_county_names(raw_dir, states):
    """Preliminary scan: map (state number, county number) to county name.

    Names are padded with spaces to at least 20 characters.  A state whose
    geo file cannot be opened is skipped.
    """
    print("Preliminary scan to read county names.")
    county_names = {}
    for abbr in states:
        print(abbr)
        try:
            geo_file = open(os.path.join(raw_dir, abbr + "geo2020.pl"), "rb")
        except OSError:
            continue
        with geo_file:
            for raw_line in geo_file:
                geo = _split_geo_record(raw_line)
                if geo[GEO_SUMMARY_LEVEL] != COUNTY_LEVEL:
                    continue
                key = (geo[GEO_STATE], geo[GEO_COUNTY])
                county_names[key] = geo[GEO_NAME].ljust(COUNTY_NAME_WIDTH)
    return county_names


def _scan_state(abbr, raw_dir, county_names, outputs):
    """Write the output lines for one state.

    The geo file and the two data files are read in lockstep: the n-th line
    of each describes the same area.  A state for which any of the three
    files cannot be opened is skipped, and whatever was opened is closed.
    """
    geo_path = os.path.join(raw_dir, abbr + "geo2020.pl")
    data_path = os.path.join(raw_dir, abbr + "000012020.pl")
    adult_path = os.path.join(raw_dir, abbr + "000022020.pl")  # 18+ pop only

    with contextlib.ExitStack() as inputs:
        try:
            geo_file = inputs.enter_context(open(geo_path, "rb"))
            data_file = inputs.enter_context(open(data_path, "r"))
            adult_file = inputs.enter_context(open(adult_path, "r"))
        except OSError:
            return

        for raw_geo in geo_file:
            # Always advance the data files, even for skipped records, so
            # the three files stay aligned.
            data_line = data_file.readline()
            adult_line = adult_file.readline()

            geo = _split_geo_record(raw_geo)
            level = geo[GEO_SUMMARY_LEVEL]
            layout = LAYOUTS.get(level)
            if layout is None:
                continue
            stem, prefix_keys, total_width, count_width, with_location = layout

            state_num = geo[GEO_STATE]
            county_num = geo[GEO_COUNTY]

            # The lat/lon are more precise than needed, e.g. +40.1234567 or
            # -099.1234567.  Keep the sign and 3 decimal places and drop
            # the decimal point: +40123 and -099123.
            latitude = geo[GEO_LATITUDE]
            longitude = geo[GEO_LONGITUDE]
            location = [
                latitude[0:3] + latitude[4:7],
                longitude[0:4] + longitude[5:8],
                _acres(geo[GEO_LAND_AREA]),
            ]

            counts = _population_counts(data_line)
            adult_counts = _population_counts(adult_line)

            columns = {
                "fips": state_num + county_num,
                "abbr": geo[GEO_STATE_ABBR],
                "name": geo[GEO_NAME][:AREA_NAME_WIDTH].ljust(AREA_NAME_WIDTH),
                "tract": geo[GEO_TRACT],
                "bg": geo[GEO_BLOCK_GROUP],
                "block": geo[GEO_BLOCK],
            }
            # Only look up a county name for levels that print one; state,
            # place and the district levels do not.
            if "county" in prefix_keys:
                county_name = county_names[(state_num, county_num)]
                columns["county"] = county_name[:COUNTY_NAME_WIDTH]

            prefix = [columns[key] for key in prefix_keys]
            tail = location if with_location else []
            line = _format_line(prefix, counts, total_width, count_width, tail)
            populated = int(counts[0]) != 0

            if level == BLOCK_LEVEL:
                # block0.txt is the only file that includes zero population.
                outputs["block0"].write(line)
                if populated:
                    outputs["block"].write(line)
                continue

            if not populated:
                continue

            outputs[stem].write(line)
            outputs[stem + "2"].write(
                _format_line(prefix, adult_counts, total_width, count_width, tail)
            )


def main(raw_dir="raw", out_dir=".", states=None):
    """Run both scans and write all output files.

    raw_dir -- directory holding the raw Census files (default "raw")
    out_dir -- directory that receives the .txt output files (default ".")
    states  -- lowercase state abbreviations to process (default stateList)
    """
    if states is None:
        states = stateList
    with contextlib.ExitStack() as stack:
        outputs = _open_outputs(stack, out_dir)
        county_names = _read_county_names(raw_dir, states)
        print("Beginning the tract scan.")
        for abbr in states:
            print(abbr)
            _scan_state(abbr, raw_dir, county_names, outputs)


if __name__ == "__main__":
    main()