module Bones
  # This class holds the main functionality: the Bones source-
  # to-source compilation engine based on algorithmic skeletons.
  # This class processes command line arguments, makes calls to
  # the Bones preprocessor and the CAST gem, analyzes the source
  # code, performs source transformations, instantiates the
  # skeletons, and finally writes output code to file.
  class Engine < Common

    # Locate the skeletons directory.
    BONES_DIR_SKELETONS = File.join(BONES_DIR,'skeletons')
    # Set the name of the transformations file as found in the skeleton library.
    SKELETON_FILE = 'skeletons.txt'
    # A list of timer files to be found in the skeleton library.
    TIMER_FILES = ['timer_1_start','timer_1_stop','timer_2_start','timer_2_stop']
    # A list of files to be found in the common directory of the skeleton library (excluding timer files).
    COMMON_FILES = ['prologue','epilogue','mem_prologue','mem_copy_H2D','mem_copy_D2H','mem_epilogue','mem_global']
    # The name of the file containing the globals as found in the skeleton library.
    COMMON_GLOBALS = 'globals'
    # The name of the file containing the header file for the original C code as found in the skeleton library.
    COMMON_HEADER = 'header'
    # The name of the file containing the globals for the kernel files as found in the skeleton library.
    COMMON_GLOBALS_KERNEL = 'globals_kernel'
    # The name of the file containing the scheduler code.
    COMMON_SCHEDULER = 'scheduler'
    # The name of the file containing the global timers.
    GLOBAL_TIMERS = 'timer_globals'

    # The extension of a host file in the skeleton library. See also SKELETON_DEVICE.
    SKELETON_HOST = '.host'
    # The extension of a device file in the skeleton library. See also SKELETON_HOST.
    SKELETON_DEVICE = '.kernel'

    # The suffix added to the generated output file for the host file. See also OUTPUT_DEVICE.
    OUTPUT_HOST = '_host'
    # The suffix added to the generated output file for the device file. See also OUTPUT_HOST.
    OUTPUT_DEVICE = '_device'
    # The suffix added to the generated verification file. See also OUTPUT_DEVICE and OUTPUT_HOST.
    OUTPUT_VERIFICATION = '_verification'

    # Initializes the engine and processes the command line
    # arguments. This method uses the 'trollop' gem to parse
    # the arguments and to create a nicely formatted help menu.
    # This method additionally initializes a result-hash and
    # reads the contents of the source file from disk.
    #
    # The process is terminated through Trollop::die when no
    # input file or target is given, when the input file does
    # not exist, or when the target has no skeleton directory.
    #
    # ==== Command-line usage:
    #   bones --application --target [OPTIONS]
    #
    # ==== Options:
    # --application, -a      : Input application file
    # --target, -t           : Target processor (one of the sub-directories of the skeleton library, 'verification' excluded)
    # --measurements, -m     : Enable/disable timers
    # --verify, -c           : Verify correctness of the generated code
    # --only-alg-number, -o  : Only generate code for the x-th species (99 -> all)
    # --merge-factor, -f     : Thread merge factor
    # --register-caching, -r : Enable register caching (1/0)
    # --zero-copy, -z        : Enable OpenCL zero-copy (1/0)
    # --skeletons, -s        : Enable non-default skeletons (1/0)
    # --version, -v          : Print version and exit
    # --help, -h             : Show this message
    #
    def initialize
      @result = {:original_code            => [],
                 :header_code              => [],
                 :host_declarations        => [],
                 :host_code_lists          => [],
                 :algorithm_declarations   => [],
                 :algorithm_code_lists     => [],
                 :verify_code              => [],
                 :host_device_mem_globals  => []}
      # Running counter substituted into the memory skeletons as ':state' (see generate).
      @state = 0

      # Provides a list of possible targets (e.g. GPU-CUDA, 'CPU-OPENCL-INTEL'):
      # every directory in the skeleton library except the verification one.
      targets = []
      Dir[File.join(BONES_DIR_SKELETONS,'*')].each do |entry|
        if (File.directory?(entry)) && !(entry =~ /verification/)
          targets.push(File.basename(entry))
        end
      end
      targets = targets.sort

      # Parse the command line options using the 'trollop' gem.
      pp_targets = targets.inspect.gsub(/("|\[)|\]/,'')
      @options = Trollop::options do
        version 'Bones '+File.read(BONES_DIR+'/VERSION').strip+' (c) 2012 Cedric Nugteren, Eindhoven University of Technology'
        banner NL+'Bones is a parallelizing source-to-source compiler based on algorithmic skeletons. ' +
               'For more information, see the README.rdoc file or visit the Bones website at http://parse.ele.tue.nl/bones/.' + NL + NL +
               'Usage:' + NL +
               ' bones --application --target [OPTIONS]' + NL +
               'using the following flags:'
        opt :application,      'Input application file',                                        :short => 'a', :type => String
        opt :target,           'Target processor (choose from: '+pp_targets+')',                :short => 't', :type => String
        opt :measurements,     'Enable/disable timers',                                         :short => 'm', :default => false
        opt :verify,           'Verify correctness of the generated code',                      :short => 'c', :default => false
        opt :only_alg_number,  'Only generate code for the x-th species (99 -> all)',           :short => 'o', :type => Integer, :default => 99
        # NOTE(review): the help text says the default is 1 while the actual default is 0 -- confirm which is intended.
        opt :merge_factor,     'Thread merge factor, default is 1 (==disabled)',                :short => 'f', :type => Integer, :default => 0
        opt :register_caching, 'Enable register caching: 1:enabled (default), 0:disabled',      :short => 'r', :type => Integer, :default => 1
        opt :zero_copy,        'Enable OpenCL zero-copy: 1:enabled (default), 0:disabled',      :short => 'z', :type => Integer, :default => 1
        opt :skeletons,        'Enable non-default skeletons: 1:enabled (default), 0:disabled', :short => 's', :type => Integer, :default => 1
      end
      Trollop::die 'no input file supplied (use: --application)' if !@options[:application_given]
      Trollop::die 'no target supplied (use: --target)' if !@options[:target_given]
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      Trollop::die 'input file "'+@options[:application]+'" does not exist' if !File.exist?(@options[:application])
      Trollop::die 'target not supported, supported targets are: '+pp_targets if !targets.include?(@options[:target].upcase)
      @options[:name] = File.basename(@options[:application], ".*")
      @options[:target] = @options[:target].upcase

      # Extension for the host files corresponding to the target
      # (taken from the first file found in the target's 'common' directory).
      @extension = File.extname(Dir[File.join(BONES_DIR_SKELETONS,@options[:target],'common','*')][0])

      # Extension for the device files corresponding to the target
      # (taken from the first '*.kernel.*' file in the target's 'kernel' directory).
      @algorithm_extension = File.extname(Dir[File.join(BONES_DIR_SKELETONS,@options[:target],'kernel','*.kernel.*')][0])

      # Set a prefix for functions called from the original file but defined in a host file
      # (currently the empty string for every target).
      @prefix = (@options[:target] == 'GPU-CUDA') ? '' : ''

      # Setting to include the scheduler (CUDA only)
      @scheduler = (@options[:target] == 'GPU-CUDA')

      # Skip analyse passes for certain targets
      @skiptarget = false #(@options[:target] == 'PAR4ALL') ? true : false

      # Set the location for the skeleton library
      @dir = {}
      @dir[:library] = File.join(BONES_DIR_SKELETONS,@options[:target])
      @dir[:skeleton_library] = File.join(@dir[:library],'kernel')
      @dir[:common_library] = File.join(@dir[:library],'common')
      @dir[:verify_library] = File.join(BONES_DIR_SKELETONS,'verification')

      # Obtain the source code from file
      @source = File.open(@options[:application],'r'){|f| f.read}
      @basename = File.basename(@options[:application],'.c')
    end

    # Method to process a file and to output target code. This
    # method calls all relevant private methods and stores its
    # results in the result-hash; nothing is written to disk
    # until write_output is called.
    #
    # ==== Tasks:
    # * Run the preprocessor to obtain algorithm information.
    # * Use the 'CAST' gem to parse the source into an AST.
    # * Call the code generator to perform the real work and produce output.
    # * If the scheduler is enabled, generate memory alloc/copy/free code.
    def process

      # Run the preprocessor
      preprocessor = Bones::Preprocessor.new(@source,File.dirname(@options[:application]),@basename,@scheduler)
      preprocessor.process
      @result[:header_code] = preprocessor.header_code
      @result[:device_header] = preprocessor.device_header
      # NOTE(review): the literal below carries no header name -- confirm against the upstream source.
      @result[:header_code] += '#include '+NL if @options[:measurements]

      # Parse the source code into AST
      parser = C::Parser.new
      parser.type_names << 'FILE'
      parser.type_names << 'size_t'
      ast = parser.parse(preprocessor.target_code)
      ast.preprocess

      # Add the scheduler's global code
      if @scheduler
        @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_SCHEDULER+@extension)))
      end

      # Set the algorithm's skeleton and generate the global code
      one_time = true
      preprocessor.algorithms.each do |algorithm|
        algorithm.species.set_skeleton(File.join(@dir[:library],SKELETON_FILE))
        if @options[:skeletons] == 0
          algorithm.species.skeleton_name = 'default'
          # Reset the settings in place, one code at a time and in this order.
          # The calls are deliberately not chained: gsub! returns nil when
          # nothing was replaced, which would make a chained call fail.
          settings = algorithm.species.settings
          ['10','20','30'].each { |code| settings.gsub!(code,'00') }
        end
        # The global code is only added once, for the first algorithm with a skeleton.
        if algorithm.species.skeleton_name && one_time
          @result[:host_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS+@extension)))
          @result[:algorithm_code_lists].push(File.read(File.join(@dir[:common_library],COMMON_GLOBALS_KERNEL+@extension)))
          one_time = false
        end
      end

      # Perform code generation (per-species code)
      @result[:original_code] = ast
      arrays = []
      preprocessor.algorithms.each_with_index do |algorithm,algorithm_number|
        # Either process everything (99) or only the requested algorithm,
        # clamped to the last available one.
        if @options[:only_alg_number] == 99 || algorithm_number == [@options[:only_alg_number],preprocessor.algorithms.length-1].min
          puts MESSAGE+'Starting code generation for algorithm "'+algorithm.name+'"'
          if algorithm.species.skeleton_name
            algorithm.merge_factor = @options[:merge_factor] if (@options[:target] == 'GPU-CUDA')
            algorithm.register_caching_enabled = @options[:register_caching]
            algorithm.set_function(ast)
            algorithm.populate_variables(ast,preprocessor.defines) if !@skiptarget
            algorithm.populate_lists()
            algorithm.populate_hash() if !@skiptarget
            generate(algorithm)
            puts MESSAGE+'Code generated using the "'+algorithm.species.skeleton_name+'" skeleton'
            arrays.concat(algorithm.arrays)
          else
            puts WARNING+'Skeleton "'+algorithm.species.name+'" not available'
          end
        end
      end

      # Only if the scheduler is included
      if @scheduler

        # Perform code generation (sync statements)
        @result[:host_declarations].push('void bones_synchronize(int bones_task_id);')

        # Perform code generation (memory allocs), once per (name, scop) combination
        allocs = []
        preprocessor.copies.each do |copy|
          name_scop = Set.new([copy.name, copy.scop])
          if !allocs.include?(name_scop)
            generate_memory('alloc',copy,arrays,0)
            allocs << name_scop
          end
        end

        # Perform code generation (memory copies)
        preprocessor.copies.each_with_index do |copy,index|
          #puts MESSAGE+'Generating copy code for array "'+copy.name+'"'
          generate_memory('copy',copy,arrays,index)
        end

        # Perform code generation (memory frees), once per (name, scop) combination
        frees = []
        preprocessor.copies.each do |copy|
          name_scop = Set.new([copy.name, copy.scop])
          if !frees.include?(name_scop)
            generate_memory('free',copy,arrays,0)
            frees << name_scop
          end
        end

      end

    end

    # This method writes the output code to files. It creates
    # a new directory formatted as 'name_target' (next to the
    # input file) and produces the files listed below.
    #
    # ==== Output files:
    # * +main+ - a file containing the original code with function calls substituting the original algorithms.
    # * +verification+ - a file containing the verification code (only with the '--verify' flag).
    # * +target+ - a file containing the host code for the target.
    # * +kernel+ - a file containing the kernel code for the target.
    def write_output

      # Create a new directory for the output
      directory = @options[:application].rpartition('.').first+'_'+@options[:target]
      # NOTE(review): mode 0744 carries no execute (search) bit for group/others -- confirm this is intended.
      Dir.mkdir(directory,0744) unless File.directory?(directory)

      parser = C::Parser.new
      parser.type_names << 'FILE'
      parser.type_names << 'size_t'

      # Populate the main file
      File.open(File.join(directory,@options[:application].split(File::SEPARATOR).last),'w') do |main|
        # NOTE(review): the literal below carries no header name -- confirm against the upstream source.
        main.puts '#include ' if @options[:verify]
        main.puts @result[:header_code]
        main.puts File.read(File.join(@dir[:common_library],COMMON_HEADER+@extension))
        main.puts @result[:host_declarations]
        main.puts
        begin
          main.puts parser.parse(@result[:original_code]).to_s
        rescue
          # Retry once on a copy of the code when the first attempt raises.
          puts WARNING+'Recovering from CAST parse error'
          main.puts parser.parse(@result[:original_code].clone).to_s
        end
      end

      # Populate the verification file
      if @options[:verify]
        File.open(File.join(directory,@options[:name]+OUTPUT_VERIFICATION+@extension),'w') do |verification|
          verification.puts @result[:header_code]
          verification.puts File.read(File.join(@dir[:verify_library],'header.c'))
          verification.puts
          verification.puts @result[:verify_code]
        end
      end

      # Populate the target file (host)
      File.open(File.join(directory,@options[:name]+OUTPUT_HOST+@extension),'w') do |target|
        # NOTE(review): the literal below carries no header name -- confirm against the upstream source.
        target.puts '#include '+NL if @options[:target] == 'GPU-CUDA'
        target.puts "#define ZEROCOPY 0"+NL if @options[:zero_copy] == 0 && @options[:target] == 'CPU-OPENCL-INTEL'
        target.puts "#define ZEROCOPY 1"+NL if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
        target.puts @result[:header_code]
        target.puts
        target.puts @result[:host_device_mem_globals].uniq
        target.puts
        target.puts @result[:algorithm_declarations]
        target.puts @result[:host_code_lists]
        target.puts
        target.puts File.read(File.join(@dir[:common_library],GLOBAL_TIMERS+@extension))
      end

      # Populate the algorithm file (device)
      File.open(File.join(directory,@options[:name]+OUTPUT_DEVICE+@algorithm_extension),'w') do |algorithm|
        algorithm.puts @result[:device_header]
        algorithm.puts @result[:algorithm_code_lists]
      end

    end

    # Start of the class's private methods.
    private

    # This method takes as an input an individual algorithm and
    # generates the corresponding output code. The method first
    # loads the skeletons, then applies the algorithm's search-
    # and-replace hash to instantiate them.
    #
    # The generated code is appended to the result-hash (device
    # code, host code, declarations and memory globals) and the
    # algorithm's code in the original AST is replaced by a call
    # to the generated function. The return value is not meant
    # to be used. 'raise_error' is called when the skeleton files
    # for the algorithm's skeleton are not present on disk.
    def generate(algorithm)

      # Determine the skeleton filenames and load the skeletons from the skeleton library
      file_name_host = File.join(@dir[:skeleton_library],algorithm.species.skeleton_name+SKELETON_HOST)
      file_name_device = File.join(@dir[:skeleton_library],algorithm.species.skeleton_name+SKELETON_DEVICE)
      # File.exist? is used because File.exists? was removed in Ruby 3.2.
      if !File.exist?(file_name_host+@extension) || !File.exist?(file_name_device+@algorithm_extension)
        raise_error('Skeleton files for skeleton "'+algorithm.species.skeleton_name+'" not available')
      end
      skeletons = {:host   => File.read(file_name_host+@extension),
                   :device => File.read(file_name_device+@algorithm_extension)}

      # Perform the transformations on the algorithm's code
      algorithm.perform_transformations(algorithm.species.settings) if !@skiptarget

      # Load the common skeletons from the skeleton library
      COMMON_FILES.each do |skeleton|
        skeletons[skeleton.to_sym] = File.read(File.join(@dir[:common_library],skeleton+@extension))
      end

      # Load the timer code from the skeleton library (only if the '--measurements' flag is given)
      TIMER_FILES.each do |skeleton|
        skeletons[skeleton.to_sym] = @options[:measurements] ? File.read(File.join(@dir[:common_library],skeleton+@extension)) : ''
      end

      # Perform search-and-replace on the device skeleton
      search_and_replace!(algorithm.hash,skeletons[:device])
      skeletons[:device].remove_extras

      # Replace mathematical functions with their equivalent device functions
      if @options[:target] == 'GPU-CUDA'
        math_functions = {:sqrt => 'sqrtf', :max => 'fmaxf', :min => 'fminf'}
        math_functions.each do |original, replacement|
          skeletons[:device].gsub!(/\b#{original}\(/,replacement+'(')
        end
      end

      # Create the algorithm declaration list from the header supplied in the skeletons
      algorithm_declaration = skeletons[:device].scan(/#{START_DEFINITION}(.+)#{END_DEFINITION}/m).join.strip.remove_extras
      @result[:algorithm_declarations].push(algorithm_declaration)

      # Remove the (commented) algorithm declaration from the code and push the skeleton to the output.
      # The string itself is pushed rather than the result of gsub!, which is nil when the
      # skeleton contains no declaration markers (the device code would then be lost).
      skeletons[:device].gsub!(/#{START_DEFINITION}(.+)#{END_DEFINITION}/m,'')
      @result[:algorithm_code_lists].push(skeletons[:device])

      # Setup some variables to create the host body function including memory allocation and memory copies
      processed = {:mem_prologue => '', :mem_copy_H2D => '', :mem_copy_D2H => '', :mem_epilogue => ''}

      # Iterate over all the array variables and create a mini-search-and-replace hash for each array (all arrays)
      algorithm.arrays.each do |array|
        minihash = { :array               => array.name,
                     :type                => array.type_name,
                     :flatten             => array.flatten,
                     :variable_dimensions => array.size.join('*'),
                     :state               => @state.to_s}
        @state += 1

        # Apply the mini-search-and-replace hash to create the memory allocations, memory copies (if input only), etc.
        processed[:mem_prologue] += search_and_replace(minihash,skeletons[:mem_prologue])
        processed[:mem_copy_H2D] += search_and_replace(minihash,skeletons[:mem_copy_H2D]) if array.input? || array.species.shared?
        processed[:mem_epilogue] += search_and_replace(minihash,skeletons[:mem_epilogue])

        # Add the device declarations
        @result[:host_device_mem_globals].push(search_and_replace(minihash,skeletons[:mem_global]))
      end

      # Iterate over all the array variables and create a mini-search-and-replace hash for each array (output arrays)
      algorithm.arrays.select(OUTPUT).each_with_index do |array, num_array|
        hash = algorithm.hash["out#{num_array}".to_sym]
        minihash = { :array               => array.name,
                     :type                => array.type_name,
                     :flatten             => array.flatten,
                     :offset              => '('+hash[:dimension0][:from]+')',
                     :variable_dimensions => '('+hash[:dimensions]+')',
                     :state               => @state.to_s}
        @state += 1

        # Perform selective copy for arrays with 2 dimensions (uses a for-loop over the memory copies).
        # This branch is currently switched off by the trailing '&& false'.
        if array.dimensions == 2 && @options[:target] == 'GPU-CUDA' && false
          x_from = '('+hash[:dimension0][:from]+')'
          x_to   = '('+hash[:dimension0][:to]+')'
          x_sum  = '('+hash[:dimension0][:sum]+')'
          x_size = array.size[0]
          y_from = '('+hash[:dimension1][:from]+')'
          y_to   = '('+hash[:dimension1][:to]+')'
          y_sum  = '('+hash[:dimension1][:sum]+')'
          y_size = array.size[1]
          processed[:mem_copy_D2H] += NL+INDENT+"for(int bones_x=#{x_from}; bones_x<=#{x_to}; bones_x++) {"+INDENT*2
          minihash[:offset] = "(bones_x*#{y_size})+#{y_from}"
          minihash[:variable_dimensions] = "#{y_sum}"
        # Don't do selective copy for multi-dimensional arrays (yet)
        elsif array.dimensions > 1
          minihash[:offset] = '0'
          minihash[:variable_dimensions] = array.size.join('*')
        end

        # Apply the mini-search-and-replace hash to create the memory copies from device to host
        processed[:mem_copy_D2H] += search_and_replace(minihash,skeletons[:mem_copy_D2H])
        if array.dimensions == 2 && @options[:target] == 'GPU-CUDA' && false
          processed[:mem_copy_D2H] += INDENT+'}'
        end
      end

      # Apply the search-and-replace hash to all timer skeletons and the host skeleton
      (['host']+TIMER_FILES).each do |skeleton|
        search_and_replace!(algorithm.hash,skeletons[skeleton.to_sym])
      end

      # Repair some invalid syntax that could have been introduced by performing the search-and-replace
      skeletons[:host].remove_extras

      # Run the prologue/epilogue code through the search-and-replace hash
      search_and_replace!(algorithm.hash,skeletons[:prologue])
      search_and_replace!(algorithm.hash,skeletons[:epilogue])

      # Construct the final host function, including the timers and memory copies.
      # With the scheduler enabled the memory code is left out here; it is then
      # produced by generate_memory instead.
      if @scheduler
        host = skeletons[:prologue     ] +
               skeletons[:timer_2_start] +
               skeletons[:host         ] +
               skeletons[:timer_2_stop ] +
               skeletons[:epilogue     ]
      else
        host = skeletons[:prologue     ] +
               skeletons[:timer_1_start] +
               processed[:mem_prologue ] +
               processed[:mem_copy_H2D ] +
               skeletons[:timer_2_start] +
               skeletons[:host         ] +
               skeletons[:timer_2_stop ] +
               processed[:mem_copy_D2H ] +
               processed[:mem_epilogue ] +
               skeletons[:timer_1_stop ] +
               skeletons[:epilogue     ]
      end

      # Generate code to replace the original code, including verification code if specified by the option flag
      verify_skeleton = File.read(File.join(@dir[:verify_library],'verify_results.c'))
      timer_start = (@options[:measurements]) ? File.read(File.join(@dir[:verify_library],'timer_start.c')) : ''
      timer_stop = (@options[:measurements]) ? File.read(File.join(@dir[:verify_library],'timer_stop.c')) : ''
      replacement_code, original_definition, verify_definition = algorithm.generate_replacement_code(@options, verify_skeleton, @result[:verify_code], @prefix, timer_start, timer_stop)
      @result[:host_declarations].push(verify_definition)

      # Add a performance model to the original code
      #replacement_code.insert(0,algorithm.performance_model_code('model'))

      # Replace mallocs and frees in the original code with aligned memory allocations (only for CPU-OpenCL targets with zero-copy)
      if @options[:zero_copy] == 1 && @options[:target] == 'CPU-OPENCL-INTEL'
        @result[:original_code].search_and_replace_function_call(C::Variable.parse('malloc'),C::Variable.parse(VARIABLE_PREFIX+'malloc_128'))
        @result[:original_code].search_and_replace_function_call(C::Variable.parse('free'),C::Variable.parse(VARIABLE_PREFIX+'free_128'))
      end

      # Give the original main function a new name
      @result[:original_code].search_and_replace_function_definition('main',VARIABLE_PREFIX+'main')

      # Replace the original code with a function call to the newly generated code
      @result[:original_code].search_and_replace_node(algorithm.code,replacement_code)

      # The host code is generated, push the data to the output hashes
      accelerated_definition = 'void '+algorithm.name+'_accelerated('+algorithm.lists[:host_definition]+')'
      @result[:host_code_lists].push(@prefix+accelerated_definition+' {'+NL+host+NL+'}'+NL+NL)
      @result[:host_declarations].push(@prefix+accelerated_definition+';'+NL+@prefix+original_definition+';')
    end

    # This method generates the code for one memory operation
    # of the scheduler. It instantiates the 'mem_async_*'
    # skeleton for the first array matching the given copy and
    # appends the code and its forward declaration to the
    # result-hash. Nothing is generated if no array matches.
    #
    # ==== Arguments:
    # * +type+ - the kind of operation ('alloc', 'copy' or 'free'); for 'copy' the copy's direction is appended to select the skeleton.
    # * +copy+ - the copy to generate code for.
    # * +arrays+ - the arrays in which to look up the copy's array (by name and direction).
    # * +index+ - the index of the copy, made available to the skeleton as ':index'.
    def generate_memory(type,copy,arrays,index)

      # Find the corresponding array
      arrays.each do |array|
        if array.name == copy.name && (array.direction == copy.direction || array.direction == INOUT)

          # Load the skeleton from the skeleton library
          skeleton_type = (type == 'copy') ? type + copy.direction : type
          skeleton = File.read(File.join(@dir[:common_library],'mem_async_'+skeleton_type+@extension))

          # Create the find-and-replace hash
          minihash = { :array               => copy.name,
                       :id                  => copy.id,
                       :index               => index.to_s,
                       :direction           => copy.direction,
                       :definition          => array.definition,
                       :type                => array.type_name,
                       :flatten             => array.flatten,
                       :offset              => '0',
                       :variable_dimensions => array.size.join('*'),
                       :state               => copy.deadline}

          # Instantiate the skeleton and add it to the final result
          @result[:host_code_lists].push(search_and_replace(minihash,skeleton))

          # Add a forward declaration of this function
          @result[:host_declarations].push(copy.get_definition(array.definition,skeleton_type))

          # Done: only the first matching array is used
          return
        end
      end
    end

  end

end