kulp · kulp · May 22, 2020 · May 22, 2020 · May 22, 2020 · May 22, 2020
diff --git a/bindgen-integration/build.rs b/bindgen-integration/build.rs
@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex, RwLock};
 struct MacroCallback {
     macros: Arc<RwLock<HashSet<String>>>,
     seen_hellos: Mutex<u32>,
+    seen_funcs: Mutex<u32>,
 }
 
 impl ParseCallbacks for MacroCallback {
@@ -45,6 +46,10 @@ impl ParseCallbacks for MacroCallback {
 
     fn str_macro(&self, name: &str, value: &[u8]) {
         match name {
+            "TESTMACRO_STRING_EXPR" => {
+                assert_eq!(value, b"string");
+                *self.seen_hellos.lock().unwrap() += 1;
+            }
             "TESTMACRO_STRING_EXPANDED" |
             "TESTMACRO_STRING" |
             "TESTMACRO_INTEGER" => {
@@ -70,15 +75,64 @@ impl ParseCallbacks for MacroCallback {
             _ => None,
         }
     }
+
+    fn func_macro(&self, name: &str, value: &[&[u8]]) {
+        match name {
+            "TESTMACRO_NONFUNCTIONAL" => {
+                panic!("func_macro was called for a non-functional macro");
+            }
+            "TESTMACRO_FUNCTIONAL_NONEMPTY(TESTMACRO_INTEGER)" => {
+                // The right-hand side of a functional macro is delivered as a
+                // sequence of tokens rather than as reconstructed source text.
+                // Test.h writes this expansion as `-TESTMACRO_INTEGER` with no
+                // whitespace, and it still arrives as two separate tokens, so
+                // no spacing convention is needed to keep tokens apart.
+                assert_eq!(value, &[b"-" as &[u8], b"TESTMACRO_INTEGER"]);
+                *self.seen_funcs.lock().unwrap() += 1;
+            }
+            "TESTMACRO_FUNCTIONAL_EMPTY(TESTMACRO_INTEGER)" => {
+                assert_eq!(value, &[] as &[&[u8]]);
+                *self.seen_funcs.lock().unwrap() += 1;
+            }
+            "TESTMACRO_FUNCTIONAL_TOKENIZED(a,b,c,d,e)" => {
+                assert_eq!(
+                    value,
+                    &[b"a" as &[u8], b"/", b"b", b"c", b"d", b"##", b"e"]
+                );
+                *self.seen_funcs.lock().unwrap() += 1;
+            }
+            "TESTMACRO_FUNCTIONAL_SPLIT(a,b)" => {
+                assert_eq!(value, &[b"b", b",", b"a"]);
+                *self.seen_funcs.lock().unwrap() += 1;
+            }
+            "TESTMACRO_STRING_FUNC_NON_UTF8(x)" => {
+                assert_eq!(
+                    value,
+                    &[b"(" as &[u8], b"x", b"\"\xff\xff\"", b")"]
+                );
+                *self.seen_funcs.lock().unwrap() += 1;
+            }
+            _ => {
+                // The system might provide lots of functional macros.
+                // Ensure we did not miss handling one that we meant to handle.
+                assert!(!name.starts_with("TESTMACRO_"), "name = {}", name);
+            }
+        }
+    }
 }
 
 impl Drop for MacroCallback {
     fn drop(&mut self) {
         assert_eq!(
             *self.seen_hellos.lock().unwrap(),
-            2,
+            3,
             "str_macro handle was not called once for all relevant macros"
-        )
+        );
+        assert_eq!(
+            *self.seen_funcs.lock().unwrap(),
+            5,
+            "func_macro handle was not called once for all relevant macros"
+        );
     }
 }
 
@@ -102,6 +156,7 @@ fn main() {
         .parse_callbacks(Box::new(MacroCallback {
             macros: macros.clone(),
             seen_hellos: Mutex::new(0),
+            seen_funcs: Mutex::new(0),
         }))
         .blacklist_function("my_prefixed_function_to_remove")
         .generate()

diff --git a/bindgen-integration/cpp/Test.h b/bindgen-integration/cpp/Test.h
@@ -7,6 +7,19 @@
 #define TESTMACRO_STRING_EXPANDED TESTMACRO_STRING
 #define TESTMACRO_CUSTOMINTKIND_PATH 123
 
+// The following two macros are parsed the same by cexpr, but are semantically
+// different.
+#define TESTMACRO_NONFUNCTIONAL (TESTMACRO_INTEGER)
+#define TESTMACRO_FUNCTIONAL_EMPTY(TESTMACRO_INTEGER)
+#define TESTMACRO_FUNCTIONAL_NONEMPTY(TESTMACRO_INTEGER)-TESTMACRO_INTEGER
+#define TESTMACRO_FUNCTIONAL_TOKENIZED(  a, b   ,c,d,e   ) a/b c    d ## e
+#define TESTMACRO_FUNCTIONAL_SPLIT(  a, \
+        b) b,\
+        a
+//#define TESTMACRO_INVALID("string") // A conforming preprocessor rejects this
+#define TESTMACRO_STRING_EXPR ("string")
+#define TESTMACRO_STRING_FUNC_NON_UTF8(x) (x "��") /* invalid UTF-8 on purpose */
+
 #include <cwchar>
 
 enum {

diff --git a/src/callbacks.rs b/src/callbacks.rs
@@ -35,10 +35,19 @@ pub trait ParseCallbacks: fmt::Debug + UnwindSafe {
         None
     }
 
-    /// This will be run on every string macro. The callback can not influence the further
+    /// This will be run on every string macro. The callback cannot influence the further
     /// treatment of the macro, but may use the value to generate additional code or configuration.
     fn str_macro(&self, _name: &str, _value: &[u8]) {}
 
+    /// This will be run on every function-like macro. The callback cannot
+    /// influence the further treatment of the macro, but may use the value to
+    /// generate additional code or configuration.
+    ///
+    /// The first parameter represents the name and argument list (including the
+    /// parentheses) of the function-like macro. The second parameter represents
+    /// the expansion of the macro as a sequence of tokens.
+    fn func_macro(&self, _name: &str, _value: &[&[u8]]) {}
+
     /// This function should return whether, given an enum variant
     /// name, and value, this enum variant will forcibly be a constant.
     fn enum_variant_behavior(

diff --git a/src/clang.rs b/src/clang.rs
@@ -239,6 +239,17 @@ impl Cursor {
         }
     }
 
+    /// Is this Cursor pointing to a function-like macro definition?
+    /// Returns `None` if `clang_Cursor_isMacroFunctionLike` is not loaded from
+    /// the available libclang (it requires 3.9 or greater).
+    pub fn is_macro_function_like(&self) -> Option<bool> {
+        if clang_Cursor_isMacroFunctionLike::is_loaded() {
+            Some(unsafe { clang_Cursor_isMacroFunctionLike(self.x) != 0 })
+        } else {
+            None
+        }
+    }
+
     /// Get the kind of referent this cursor is pointing to.
     pub fn kind(&self) -> CXCursorKind {
         self.x.kind
@@ -698,30 +709,9 @@ impl Cursor {
 
     /// Gets the tokens that correspond to that cursor as  `cexpr` tokens.
     pub fn cexpr_tokens(self) -> Vec<cexpr::token::Token> {
-        use cexpr::token;
-
         self.tokens()
             .iter()
-            .filter_map(|token| {
-                let kind = match token.kind {
-                    CXToken_Punctuation => token::Kind::Punctuation,
-                    CXToken_Literal => token::Kind::Literal,
-                    CXToken_Identifier => token::Kind::Identifier,
-                    CXToken_Keyword => token::Kind::Keyword,
-                    // NB: cexpr is not too happy about comments inside
-                    // expressions, so we strip them down here.
-                    CXToken_Comment => return None,
-                    _ => {
-                        error!("Found unexpected token kind: {:?}", token);
-                        return None;
-                    }
-                };
-
-                Some(token::Token {
-                    kind,
-                    raw: token.spelling().to_vec().into_boxed_slice(),
-                })
-            })
+            .filter_map(|token| token.as_cexpr_token())
             .collect()
     }
 
@@ -793,13 +783,16 @@ impl<'a> Drop for RawTokens<'a> {
     }
 }
 
-/// A raw clang token, that exposes only the kind and spelling. This is a
+/// A raw clang token, that exposes only kind, spelling, and extent. This is a
 /// slightly more convenient version of `CXToken` which owns the spelling
-/// string.
+/// string and extent.
 #[derive(Debug)]
 pub struct ClangToken {
     spelling: CXString,
-    /// The kind of token, this is the same as the relevant member from
+    /// The extent of the token. This is the same as the relevant member from
+    /// `CXToken`.
+    pub extent: CXSourceRange,
+    /// The kind of the token. This is the same as the relevant member from
     /// `CXToken`.
     pub kind: CXTokenKind,
 }
@@ -812,6 +805,30 @@ impl ClangToken {
         };
         c_str.to_bytes()
     }
+
+    /// Converts to a `cexpr` token; `None` for comments and unexpected kinds.
+    pub fn as_cexpr_token(&self) -> Option<cexpr::token::Token> {
+        use cexpr::token;
+
+        let kind = match self.kind {
+            CXToken_Punctuation => token::Kind::Punctuation,
+            CXToken_Literal => token::Kind::Literal,
+            CXToken_Identifier => token::Kind::Identifier,
+            CXToken_Keyword => token::Kind::Keyword,
+            // NB: cexpr is not too happy about comments inside
+            // expressions, so we strip them down here.
+            CXToken_Comment => return None,
+            _ => {
+                error!("Found unexpected token kind: {:?}", self);
+                return None;
+            }
+        };
+
+        Some(token::Token {
+            kind,
+            raw: self.spelling().to_vec().into_boxed_slice(),
+        })
+    }
 }
 
 impl Drop for ClangToken {
@@ -834,7 +851,12 @@ impl<'a> Iterator for ClangTokenIterator<'a> {
         unsafe {
             let kind = clang_getTokenKind(*raw);
             let spelling = clang_getTokenSpelling(self.tu, *raw);
-            Some(ClangToken { kind, spelling })
+            let extent = clang_getTokenExtent(self.tu, *raw);
+            Some(ClangToken {
+                kind,
+                extent,
+                spelling,
+            })
         }
     }
 }

diff --git a/src/ir/var.rs b/src/ir/var.rs
@@ -8,6 +8,7 @@ use super::item::Item;
 use super::ty::{FloatKind, TypeKind};
 use crate::callbacks::MacroParsingBehavior;
 use crate::clang;
+use crate::clang::ClangToken;
 use crate::parse::{
     ClangItemParser, ClangSubItemParser, ParseError, ParseResult,
 };
@@ -130,6 +131,71 @@ fn default_macro_constant_type(value: i64) -> IntKind {
     }
 }
 
+/// Checks whether the tokens of a CXCursor_MacroDefinition form a
+/// function-like macro. If not, returns `Ok(())`. If so, calls the `func_macro`
+/// callback and returns `Err(ParseError::Continue)` to skip further processing.
+/// The same error is returned when no closing parenthesis is found, or when the
+/// name and parameter list fail UTF-8 conversion (which should be infallible).
+fn handle_function_macro(
+    cursor: &clang::Cursor,
+    tokens: &[ClangToken],
+    callbacks: &dyn crate::callbacks::ParseCallbacks,
+) -> Result<(), ParseError> {
+    fn is_abutting(a: &ClangToken, b: &ClangToken) -> bool {
+        unsafe {
+            clang_sys::clang_equalLocations(
+                clang_sys::clang_getRangeEnd(a.extent),
+                clang_sys::clang_getRangeStart(b.extent),
+            ) != 0
+        }
+    }
+
+    let is_functional_macro =
+        // If we have libclang >= 3.9, we can use `is_macro_function_like()` and
+        // avoid checking for abutting tokens ourselves.
+        cursor.is_macro_function_like().unwrap_or_else(|| {
+            // If we cannot get a definitive answer from clang, we instead check
+            // for a parenthesis token immediately adjacent to (that is,
+            // abutting) the first token in the macro definition.
+            match tokens.get(0..2) {
+                Some([a, b]) => is_abutting(&a, &b) && b.spelling() == b"(",
+                _ => false,
+            }
+        });
+
+    if !is_functional_macro {
+        return Ok(());
+    }
+
+    let is_closing_paren = |t: &ClangToken| {
+        // Test cheap token kind before comparing exact spellings.
+        t.kind == clang_sys::CXToken_Punctuation && t.spelling() == b")"
+    };
+    let boundary = tokens.iter().position(is_closing_paren);
+
+    let mut spelled = tokens.iter().map(ClangToken::spelling);
+    // Add 1, to convert index to length.
+    let left = spelled
+        .by_ref()
+        .take(boundary.ok_or(ParseError::Continue)? + 1);
+    let left = left.collect::<Vec<_>>().concat();
+    let left = String::from_utf8(left).map_err(|_| ParseError::Continue)?;
+    let right = spelled;
+    // Drop last token with LLVM < 4.0, due to an LLVM bug.
+    //
+    // See:
+    //   https://bugs.llvm.org/show_bug.cgi?id=9069
+    let len = match (right.len(), crate::clang_version().parsed) {
+        (len, Some((v, _))) if len > 0 && v < 4 => len - 1,
+        (len, _) => len,
+    };
+    let right: Vec<_> = right.take(len).collect();
+    callbacks.func_macro(&left, &right);
+
+    // We handled the macro, skip future macro processing.
+    Err(ParseError::Continue)
+}
+
 impl ClangSubItemParser for Var {
     fn parse(
         cursor: clang::Cursor,
@@ -140,16 +206,20 @@ impl ClangSubItemParser for Var {
         use clang_sys::*;
         match cursor.kind() {
             CXCursor_MacroDefinition => {
+                let tokens: Vec<_> = cursor.tokens().iter().collect();
+
                 if let Some(callbacks) = ctx.parse_callbacks() {
                     match callbacks.will_parse_macro(&cursor.spelling()) {
                         MacroParsingBehavior::Ignore => {
                             return Err(ParseError::Continue);
                         }
                         MacroParsingBehavior::Default => {}
                     }
+
+                    handle_function_macro(&cursor, &tokens, callbacks)?;
                 }
 
-                let value = parse_macro(ctx, &cursor);
+                let value = parse_macro(ctx, &tokens);
 
                 let (id, value) = match value {
                     Some(v) => v,
@@ -316,11 +386,14 @@ impl ClangSubItemParser for Var {
 /// Try and parse a macro using all the macros parsed until now.
 fn parse_macro(
     ctx: &BindgenContext,
-    cursor: &clang::Cursor,
+    tokens: &[ClangToken],
 ) -> Option<(Vec<u8>, cexpr::expr::EvalResult)> {
     use cexpr::expr;
 
-    let mut cexpr_tokens = cursor.cexpr_tokens();
+    let mut cexpr_tokens: Vec<_> = tokens
+        .iter()
+        .filter_map(ClangToken::as_cexpr_token)
+        .collect();
 
     let parser = expr::IdentifierParser::new(ctx.parsed_macros());