//! Implementation of emitting DWARF debugging information for `*.wat` files. //! //! This is intended to be relatively simple but the goal is to enable emission //! of DWARF sections which point back to the original `*.wat` file itself. This //! enables debuggers like LLDB to debug `*.wat` files without necessarily //! built-in knowledge of WebAssembly itself. //! //! Overall I was curious on weekend and decided to implement this. It's an //! off-by-default crate feature and an off-by-default runtime feature of this //! crate. Hopefully doesn't carry too much complexity with it while still being //! easy/fun to play around with. use crate::core::binary::{EncodeOptions, Encoder, GenerateDwarf, Names, RecOrType}; use crate::core::{InnerTypeKind, Local, ValType}; use crate::token::Span; use gimli::write::{ self, Address, AttributeValue, DwarfUnit, Expression, FileId, LineProgram, LineString, Sections, UnitEntryId, Writer, }; use gimli::{Encoding, Format, LineEncoding, LittleEndian}; use std::cmp::Ordering; use std::collections::HashMap; use std::path::Path; pub struct Dwarf<'a> { // Metadata configured at `Dwarf` creation time func_names: HashMap, func_imports: u32, contents: &'a str, file_id: FileId, style: GenerateDwarf, types: &'a [RecOrType<'a>], dwarf: DwarfUnit, // Next function index when `start_func` is called next_func: u32, // Current `DW_TAG_subprogram` that's being built as part of `start_func`. cur_subprogram: Option, cur_subprogram_instrs: usize, // Code-section-relative offset of the start of every function. Filled in // as part of `end_func`. sym_offsets: Vec, // Metadata tracking what the line/column information was at the specified // last offset. line: u64, column: u64, last_offset: usize, i32_ty: Option, i64_ty: Option, f32_ty: Option, f64_ty: Option, } impl<'a> Dwarf<'a> { /// Creates a new `Dwarf` from the specified configuration. /// /// * `func_imports` - the number of imported functions in this module, or /// the index of the first defined function. /// * `opts` - encoding options, namely where whether DWARF is to be emitted /// is configured. /// * `names` - the `name` custom section for this module, used to name /// functions in DWARF. pub fn new( func_imports: u32, opts: &EncodeOptions<'a>, names: &Names<'a>, types: &'a [RecOrType<'a>], ) -> Option> { // This is a load-bearing `?` which notably short-circuits all DWARF // machinery entirely if this was not enabled at runtime. let (file, contents, style) = opts.dwarf_info?; let file = file.to_str()?; // Configure some initial `gimli::write` context. let encoding = Encoding { address_size: 4, format: Format::Dwarf32, version: 5, }; let mut dwarf = DwarfUnit::new(encoding); let (comp_dir, comp_file) = match ( Path::new(file).parent().and_then(|s| s.to_str()), Path::new(file).file_name().and_then(|s| s.to_str()), ) { (Some(parent), Some(file_name)) if !parent.is_empty() => (parent, file_name), _ => (".", file), }; let comp_dir_ref = dwarf.strings.add(comp_dir); let comp_file_ref = dwarf.strings.add(comp_file); dwarf.unit.line_program = LineProgram::new( encoding, LineEncoding::default(), LineString::StringRef(comp_dir_ref), LineString::StringRef(comp_file_ref), None, ); let dir_id = dwarf.unit.line_program.default_directory(); let file_id = dwarf .unit .line_program .add_file(LineString::StringRef(comp_file_ref), dir_id, None); // Configure a single compilation unit which encompasses the entire code // section. The code section isn't fully known at this point so only a // "low pc" is emitted here. let root = dwarf.unit.root(); let cu = dwarf.unit.get_mut(root); cu.set( gimli::DW_AT_producer, AttributeValue::String(format!("wast {}", env!("CARGO_PKG_VERSION")).into_bytes()), ); cu.set( gimli::DW_AT_language, // Technically this should be something like wasm or wat but that // doesn't exist so fill in something here. AttributeValue::Language(gimli::DW_LANG_C), ); cu.set(gimli::DW_AT_name, AttributeValue::StringRef(comp_file_ref)); cu.set( gimli::DW_AT_comp_dir, AttributeValue::StringRef(comp_dir_ref), ); cu.set(gimli::DW_AT_low_pc, AttributeValue::Data4(0)); // Build a lookup table for defined function index to its name. let mut func_names = HashMap::new(); for (idx, name) in names.funcs.iter() { func_names.insert(*idx, *name); } // Offsets pointing to newlines are considered internally as the 0th // column of the next line, so handle the case that the contents start // with a newline. let (line, column) = if contents.starts_with("\n") { (2, 0) } else { (1, 1) }; Some(Dwarf { dwarf, style, next_func: func_imports, func_imports, sym_offsets: Vec::new(), contents, line, column, last_offset: 0, file_id, cur_subprogram: None, cur_subprogram_instrs: 0, func_names, i32_ty: None, i64_ty: None, f32_ty: None, f64_ty: None, types, }) } /// Start emitting a new function defined at `span`. /// /// This will start a new line program for this function and additionally /// configure a new `DW_TAG_subprogram` for this new function. pub fn start_func(&mut self, span: Span, ty: u32, locals: &[Local<'_>]) { self.change_linecol(span); let addr = Address::Symbol { symbol: (self.next_func - self.func_imports) as usize, addend: 0, }; self.dwarf.unit.line_program.begin_sequence(Some(addr)); let root = self.dwarf.unit.root(); let subprogram = self.dwarf.unit.add(root, gimli::DW_TAG_subprogram); let entry = self.dwarf.unit.get_mut(subprogram); let fallback = format!("wasm-function[{}]", self.next_func); let func_name = self .func_names .get(&self.next_func) .copied() .unwrap_or(&fallback); entry.set(gimli::DW_AT_name, AttributeValue::String(func_name.into())); entry.set( gimli::DW_AT_decl_file, AttributeValue::FileIndex(Some(self.file_id)), ); entry.set(gimli::DW_AT_decl_line, AttributeValue::Udata(self.line)); entry.set(gimli::DW_AT_decl_column, AttributeValue::Udata(self.column)); entry.set(gimli::DW_AT_external, AttributeValue::FlagPresent); entry.set(gimli::DW_AT_low_pc, AttributeValue::Address(addr)); if let GenerateDwarf::Full = self.style { self.add_func_params_and_locals(subprogram, ty, locals); } self.cur_subprogram = Some(subprogram); self.cur_subprogram_instrs = 0; self.next_func += 1; } /// Adds `DW_TAG_formal_parameter` and `DW_TAG_variable` for all locals /// (which are both params and function-defined locals). /// /// This is pretty simple in that the expression for the location of /// these variables is constant, it's just "it's the local", and it spans /// the entire function. fn add_func_params_and_locals( &mut self, subprogram: UnitEntryId, ty: u32, locals: &[Local<'_>], ) { // Iterate through `self.types` which is what was encoded into the // module and find the function type which gives access to the // parameters which gives access to their types. let ty = self .types .iter() .flat_map(|t| match t { RecOrType::Type(t) => std::slice::from_ref(*t), RecOrType::Rec(r) => &r.types, }) .nth(ty as usize); let ty = match ty.map(|t| &t.def.kind) { Some(InnerTypeKind::Func(ty)) => ty, _ => return, }; let mut local_idx = 0; for (_, _, ty) in ty.params.iter() { self.local(local_idx, subprogram, gimli::DW_TAG_formal_parameter, ty); local_idx += 1; } for local in locals { self.local(local_idx, subprogram, gimli::DW_TAG_variable, &local.ty); local_idx += 1; } } /// Attempts to define a local variable within `subprogram` with the `ty` /// given. /// /// This does nothing if `ty` can't be represented in DWARF. fn local(&mut self, local: u32, subprogram: UnitEntryId, tag: gimli::DwTag, ty: &ValType<'_>) { let ty = match self.val_type_to_dwarf_type(ty) { Some(ty) => ty, None => return, }; let param = self.dwarf.unit.add(subprogram, tag); let entry = self.dwarf.unit.get_mut(param); entry.set( gimli::DW_AT_name, AttributeValue::String(format!("local{local}").into()), ); let mut loc = Expression::new(); loc.op_wasm_local(local); loc.op(gimli::DW_OP_stack_value); entry.set(gimli::DW_AT_location, AttributeValue::Exprloc(loc)); entry.set(gimli::DW_AT_type, AttributeValue::UnitRef(ty)); } fn val_type_to_dwarf_type(&mut self, ty: &ValType<'_>) -> Option { match ty { ValType::I32 => Some(self.i32_ty()), ValType::I64 => Some(self.i64_ty()), ValType::F32 => Some(self.f32_ty()), ValType::F64 => Some(self.f64_ty()), // TODO: make a C union of sorts or something like that to // represent v128 as an array-of-lanes or u128 or something like // that. ValType::V128 => None, // Not much that can be done about reference types without actually // knowing what the engine does, this probably needs an addition to // DWARF itself to represent this. ValType::Ref(_) => None, } } /// Emit an instruction which starts at `offset` and is defined at `span`. /// /// Note that `offset` is code-section-relative. pub fn instr(&mut self, offset: usize, span: Span) { self.change_linecol(span); let offset = u64::try_from(offset).unwrap(); let mut changed = false; let row = self.dwarf.unit.line_program.row(); set(&mut row.address_offset, offset, &mut changed); set(&mut row.line, self.line, &mut changed); set(&mut row.column, self.column, &mut changed); set(&mut row.file, self.file_id, &mut changed); set(&mut row.is_statement, true, &mut changed); set( &mut row.prologue_end, self.cur_subprogram_instrs == 0, &mut changed, ); if changed { self.dwarf.unit.line_program.generate_row(); } self.cur_subprogram_instrs += 1; fn set(slot: &mut T, val: T, changed: &mut bool) { if *slot != val { *slot = val; *changed = true; } } } /// Change `self.line` and `self.column` to be appropriate for the offset /// in `span`. /// /// This will incrementally move from `self.last_offset` to `span.offset()` /// and update line/column information as we go. It's assumed that this is /// more efficient than a precalculate-all-the-positions-for-each-byte /// approach since that would require a great deal of memory to store a /// line/col for all bytes in the input string. It's also assumed that most /// `span` adjustments are minor as it's between instructions in a function /// which are frequently close together. Whether or not this assumption /// pans out is yet to be seen. fn change_linecol(&mut self, span: Span) { let offset = span.offset(); loop { match self.last_offset.cmp(&offset) { Ordering::Less => { let haystack = self.contents[self.last_offset + 1..].as_bytes(); let next_newline = match memchr::memchr_iter(b'\n', haystack).next() { Some(pos) => pos + self.last_offset + 1, None => break, }; if next_newline > offset { break; } else { self.line += 1; self.column = 0; self.last_offset = next_newline; } } Ordering::Greater => { let haystack = self.contents[..self.last_offset].as_bytes(); match memchr::memchr_iter(b'\n', haystack).next_back() { Some(prev_newline) => { if self.column == 0 { self.line -= 1; } self.column = 0; self.last_offset = prev_newline; } None => { self.line = 1; self.column = 1; self.last_offset = 0; } } } Ordering::Equal => break, } } match self.last_offset.cmp(&offset) { Ordering::Less => { self.column += (offset - self.last_offset) as u64; } Ordering::Greater => { self.column -= (self.last_offset - offset) as u64; } Ordering::Equal => {} } self.last_offset = offset; } /// Completes emission of the latest function. /// /// The latest function took `func_size` bytes to encode and the current end /// of the code section, after the function was appended, is /// `code_section_end`. pub fn end_func(&mut self, func_size: usize, code_section_end: usize) { // Add a final row corresponding to the final `end` instruction in the // function to ensure there's something for all bytecodes. let row = self.dwarf.unit.line_program.row(); row.address_offset = (func_size - 1) as u64; self.dwarf.unit.line_program.generate_row(); // This function's symbol is relative to the start of the function // itself. Functions are encoded as a leb-size-of-function then the // function itself, so to account for the size of the // leb-size-of-function we calculate the function start as the current // end of the code section minus the size of the function's bytes. self.sym_offsets.push(code_section_end - func_size); // The line program is relative to the start address, so only the // function's size is used here. self.dwarf .unit .line_program .end_sequence(u64::try_from(func_size).unwrap()); // The high PC value here is relative to `DW_AT_low_pc`, so it's the // size of the function. let entry = self.dwarf.unit.get_mut(self.cur_subprogram.take().unwrap()); entry.set( gimli::DW_AT_high_pc, AttributeValue::Data4(func_size as u32), ); } pub fn set_code_section_size(&mut self, size: usize) { let root = self.dwarf.unit.root(); let entry = self.dwarf.unit.get_mut(root); entry.set(gimli::DW_AT_high_pc, AttributeValue::Data4(size as u32)); } pub fn emit(&mut self, dst: &mut Encoder<'_>) { let mut sections = Sections::new(DwarfWriter { sym_offsets: &self.sym_offsets, bytes: Vec::new(), }); self.dwarf.write(&mut sections).unwrap(); sections .for_each(|id, writer| { if !writer.bytes.is_empty() { dst.wasm.section(&wasm_encoder::CustomSection { name: id.name().into(), data: (&writer.bytes).into(), }); } Ok::<_, std::convert::Infallible>(()) }) .unwrap(); } fn i32_ty(&mut self) -> UnitEntryId { if self.i32_ty.is_none() { self.i32_ty = Some(self.mk_primitive("i32", 4, gimli::DW_ATE_signed)); } self.i32_ty.unwrap() } fn i64_ty(&mut self) -> UnitEntryId { if self.i64_ty.is_none() { self.i64_ty = Some(self.mk_primitive("i64", 8, gimli::DW_ATE_signed)); } self.i64_ty.unwrap() } fn f32_ty(&mut self) -> UnitEntryId { if self.f32_ty.is_none() { self.f32_ty = Some(self.mk_primitive("f32", 4, gimli::DW_ATE_float)); } self.f32_ty.unwrap() } fn f64_ty(&mut self) -> UnitEntryId { if self.f64_ty.is_none() { self.f64_ty = Some(self.mk_primitive("f64", 8, gimli::DW_ATE_float)); } self.f64_ty.unwrap() } fn mk_primitive(&mut self, name: &str, byte_size: u8, encoding: gimli::DwAte) -> UnitEntryId { let name = self.dwarf.strings.add(name); let root = self.dwarf.unit.root(); let ty = self.dwarf.unit.add(root, gimli::DW_TAG_base_type); let entry = self.dwarf.unit.get_mut(ty); entry.set(gimli::DW_AT_name, AttributeValue::StringRef(name)); entry.set(gimli::DW_AT_byte_size, AttributeValue::Data1(byte_size)); entry.set(gimli::DW_AT_encoding, AttributeValue::Encoding(encoding)); ty } } #[derive(Clone)] struct DwarfWriter<'a> { sym_offsets: &'a [usize], bytes: Vec, } impl Writer for DwarfWriter<'_> { type Endian = LittleEndian; fn endian(&self) -> Self::Endian { LittleEndian } fn len(&self) -> usize { self.bytes.len() } fn write(&mut self, bytes: &[u8]) -> write::Result<()> { self.bytes.extend_from_slice(bytes); Ok(()) } fn write_at(&mut self, offset: usize, bytes: &[u8]) -> write::Result<()> { self.bytes[offset..][..bytes.len()].copy_from_slice(bytes); Ok(()) } fn write_address(&mut self, address: Address, size: u8) -> write::Result<()> { match address { Address::Constant(val) => self.write_udata(val, size), Address::Symbol { symbol, addend } => { assert_eq!(addend, 0); let offset = self.sym_offsets[symbol]; self.write_udata(offset as u64, size) } } } } #[cfg(test)] mod tests { use super::{Dwarf, EncodeOptions, GenerateDwarf}; use crate::token::Span; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; fn linecol_test(contents: &str) { let mut dwarf = Dwarf::new( 0, EncodeOptions::default().dwarf("foo.wat".as_ref(), contents, GenerateDwarf::Lines), &Default::default(), &[], ) .unwrap(); // Print some debugging information in case someone's debugging this // test let mut offset = 0; for (i, line) in contents.lines().enumerate() { println!( "line {:2} is at {:2} .. {:2}", i + 1, offset, offset + line.len() ); offset += line.len() + 1; } println!(""); // Precalculate (line, col) for all characters, assumed to all be one // byte here. let mut precalculated_linecols = Vec::new(); let mut line = 1; let mut col = 1; for c in contents.chars() { if c == '\n' { line += 1; col = 0; } precalculated_linecols.push((line, col)); col += 1; } // Traverse randomly throughout this string and assert that the // incremental update matches the precalculated position. let mut rand = SmallRng::seed_from_u64(102); for _ in 0..1000 { let pos = rand.gen_range(0..contents.len()); dwarf.change_linecol(Span::from_offset(pos)); let (line, col) = precalculated_linecols[pos]; assert_eq!(dwarf.line, line, "line mismatch"); assert_eq!(dwarf.column, col, "column mismatch"); } } #[test] fn linecol_simple() { linecol_test( "a b c (; ... ;) d e f fg", ); } #[test] fn linecol_empty() { linecol_test("x"); } #[test] fn linecol_start_newline() { linecol_test("\nx ab\nyyy \ncc"); } #[test] fn linecol_lots_of_newlines() { linecol_test("\n\n\n\n"); } #[test] fn linecol_interspersed() { linecol_test("\na\nb\nc\n"); } }