# encoding: utf-8 module Util class TextCleaner # Takes in text file, cleans data and writes to new text file. def clean_data(data) clean_controls(data) end # Cleans control information from passed in file def clean_controls(extracted_data) controls_data = isolate_controls_data(extracted_data) clean_section_header = remove_section_header(controls_data) clean_whitespace = remove_newline_in_controls(clean_section_header) clean_special = remove_special(clean_whitespace) clean_no_space = remove_extra_space(clean_special) clean_pagenum = remove_pagenum(clean_no_space) separate_controls(clean_pagenum) end # Removes everything before and after the controls def isolate_controls_data(extracted_data) extracted_data = extracted_data.gsub(/\| P a g e+/, "| P a g e\n") extracted_data = extracted_data.split("\n").map(&:strip).reject { |e| e.to_s.empty? }.join("\n") extracted_data = extracted_data.gsub('???', '') /^1\.1\s*[^)]*?(?=\)$)(.*\n)*?(?=\s*Appendix:)/.match(extracted_data).to_s end # Removes all pagenumbers between the controls def remove_pagenum(extracted_data) clean_pagenum = extracted_data.gsub(/(\d{1,3}\|Page|\d{1,3} \| P a g e)/, '').to_s clean_pagenum.gsub(/(\d{1,3} \| Page)/, '').to_s end # Removes section headers for each control def remove_section_header(extracted_data) extracted_data.gsub(/(?