Skip to content

Commit b312ac1

Browse files
authored
Doc gen: Migrate aggregate functions doc to attribute based. (#13646)
* Doc gen: Migrate aggregate functions doc to attribute based.
1 parent 2ac8af8 commit b312ac1

28 files changed

+689
-725
lines changed

datafusion-cli/Cargo.lock

Lines changed: 81 additions & 69 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/src/bin/print_functions_docs.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,29 @@ fn print_window_docs() -> Result<String> {
8484
print_docs(providers, window_doc_sections::doc_sections())
8585
}
8686

87+
// Temporary method useful to semi-automate
88+
// the migration of UDF documentation generation from code based
89+
// to attribute based
90+
// To be removed once the migration is complete
91+
fn save_doc_code_text(documentation: &Documentation, name: &str) {
92+
let attr_text = documentation.to_doc_attribute();
93+
94+
let file_path = format!("{}.txt", name);
95+
if std::path::Path::new(&file_path).exists() {
96+
std::fs::remove_file(&file_path).unwrap();
97+
}
98+
99+
// Open the file in append mode, create it if it doesn't exist
100+
let mut file = std::fs::OpenOptions::new()
101+
.append(true) // Open in append mode
102+
.create(true) // Create the file if it doesn't exist
103+
.open(file_path)
104+
.unwrap();
105+
106+
use std::io::Write;
107+
file.write_all(attr_text.as_bytes()).unwrap();
108+
}
109+
87110
fn print_docs(
88111
providers: Vec<Box<dyn DocProvider>>,
89112
doc_sections: Vec<DocSection>,
@@ -158,6 +181,9 @@ fn print_docs(
158181
unreachable!()
159182
};
160183

184+
// Temporary for doc gen migration, see `save_doc_code_text` comments
185+
save_doc_code_text(documentation, &name);
186+
161187
// first, the name, description and syntax example
162188
let _ = write!(
163189
docs,

datafusion/doc/src/lib.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,90 @@ impl Documentation {
6363
) -> DocumentationBuilder {
6464
DocumentationBuilder::new(doc_section, description, syntax_example)
6565
}
66+
67+
/// Output the `Documentation` struct in the form of custom Rust documentation attributes.
68+
/// It is useful to semi-automate the migration of UDF documentation
69+
/// generation from code-based to attribute-based, and can be safely removed afterwards
70+
pub fn to_doc_attribute(&self) -> String {
71+
let mut result = String::new();
72+
73+
result.push_str("#[user_doc(");
74+
// Doc Section
75+
result.push_str(
76+
format!(
77+
"\n doc_section({}label = \"{}\"{}),",
78+
if !self.doc_section.include {
79+
"include = \"false\", "
80+
} else {
81+
""
82+
},
83+
self.doc_section.label,
84+
self.doc_section
85+
.description
86+
.map(|s| format!(", description = \"{}\"", s))
87+
.unwrap_or_default(),
88+
)
89+
.as_ref(),
90+
);
91+
92+
// Description
93+
result.push_str(format!("\n description=\"{}\",", self.description).as_ref());
94+
// Syntax Example
95+
result.push_str(
96+
format!("\n syntax_example=\"{}\",", self.syntax_example).as_ref(),
97+
);
98+
// SQL Example
99+
result.push_str(
100+
&self
101+
.sql_example
102+
.clone()
103+
.map(|s| format!("\n sql_example = r#\"{}\"#,", s))
104+
.unwrap_or_default(),
105+
);
106+
107+
let st_arg_token = " expression to operate on. Can be a constant, column, or function, and any combination of operators.";
108+
// Standard Arguments
109+
if let Some(args) = self.arguments.clone() {
110+
args.iter().for_each(|(name, value)| {
111+
if value.contains(st_arg_token) {
112+
if name.starts_with("The ") {
113+
result.push_str(format!("\n standard_argument(\n name = \"{}\"),", name).as_ref());
114+
} else {
115+
result.push_str(format!("\n standard_argument(\n name = \"{}\",\n prefix = \"{}\"\n ),", name, value.replace(st_arg_token, "")).as_ref());
116+
}
117+
}
118+
});
119+
}
120+
121+
// Arguments
122+
if let Some(args) = self.arguments.clone() {
123+
args.iter().for_each(|(name, value)| {
124+
if !value.contains(st_arg_token) {
125+
result.push_str(format!("\n argument(\n name = \"{}\",\n description = \"{}\"\n ),", name, value).as_ref());
126+
}
127+
});
128+
}
129+
130+
if let Some(alt_syntax) = self.alternative_syntax.clone() {
131+
alt_syntax.iter().for_each(|syntax| {
132+
result.push_str(
133+
format!("\n alternative_syntax = \"{}\",", syntax).as_ref(),
134+
);
135+
});
136+
}
137+
138+
// Related UDFs
139+
if let Some(related_udf) = self.related_udfs.clone() {
140+
related_udf.iter().for_each(|udf| {
141+
result
142+
.push_str(format!("\n related_udf(name = \"{}\"),", udf).as_ref());
143+
});
144+
}
145+
146+
result.push_str("\n)]");
147+
148+
result
149+
}
66150
}
67151

68152
#[derive(Debug, Clone, PartialEq)]

datafusion/functions-aggregate/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,11 @@ ahash = { workspace = true }
4242
arrow = { workspace = true }
4343
arrow-schema = { workspace = true }
4444
datafusion-common = { workspace = true }
45+
datafusion-doc = { workspace = true }
4546
datafusion-execution = { workspace = true }
4647
datafusion-expr = { workspace = true }
4748
datafusion-functions-aggregate-common = { workspace = true }
49+
datafusion-macros = { workspace = true }
4850
datafusion-physical-expr = { workspace = true }
4951
datafusion-physical-expr-common = { workspace = true }
5052
half = { workspace = true }

datafusion/functions-aggregate/src/approx_distinct.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,19 @@ use datafusion_common::ScalarValue;
3131
use datafusion_common::{
3232
downcast_value, internal_err, not_impl_err, DataFusionError, Result,
3333
};
34-
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
34+
use datafusion_doc::DocSection;
3535
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
3636
use datafusion_expr::utils::format_state_name;
3737
use datafusion_expr::{
3838
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
3939
};
40+
use datafusion_macros::user_doc;
4041
use std::any::Any;
4142
use std::fmt::{Debug, Formatter};
4243
use std::hash::Hash;
4344
use std::marker::PhantomData;
4445
use std::sync::OnceLock;
46+
4547
make_udaf_expr_and_func!(
4648
ApproxDistinct,
4749
approx_distinct,
@@ -243,6 +245,20 @@ impl Default for ApproxDistinct {
243245
}
244246
}
245247

248+
#[user_doc(
249+
doc_section(label = "Approximate Functions"),
250+
description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.",
251+
syntax_example = "approx_distinct(expression)",
252+
sql_example = r#"```sql
253+
> SELECT approx_distinct(column_name) FROM table_name;
254+
+-----------------------------------+
255+
| approx_distinct(column_name) |
256+
+-----------------------------------+
257+
| 42 |
258+
+-----------------------------------+
259+
```"#,
260+
standard_argument(name = "expression",)
261+
)]
246262
pub struct ApproxDistinct {
247263
signature: Signature,
248264
}
@@ -309,25 +325,6 @@ impl AggregateUDFImpl for ApproxDistinct {
309325
}
310326

311327
fn documentation(&self) -> Option<&Documentation> {
312-
Some(get_approx_distinct_doc())
328+
self.doc()
313329
}
314330
}
315-
316-
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
317-
318-
fn get_approx_distinct_doc() -> &'static Documentation {
319-
DOCUMENTATION.get_or_init(|| {
320-
Documentation::builder(DOC_SECTION_APPROXIMATE, "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", "approx_distinct(expression)")
321-
.with_sql_example(r#"```sql
322-
> SELECT approx_distinct(column_name) FROM table_name;
323-
+-----------------------------------+
324-
| approx_distinct(column_name) |
325-
+-----------------------------------+
326-
| 42 |
327-
+-----------------------------------+
328-
```"#,
329-
)
330-
.with_standard_argument("expression", None)
331-
.build()
332-
})
333-
}

datafusion/functions-aggregate/src/approx_median.rs

Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,14 @@ use arrow::{datatypes::DataType, datatypes::Field};
2525
use arrow_schema::DataType::{Float64, UInt64};
2626

2727
use datafusion_common::{not_impl_err, plan_err, Result};
28-
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
28+
use datafusion_doc::DocSection;
2929
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
3030
use datafusion_expr::type_coercion::aggregates::NUMERICS;
3131
use datafusion_expr::utils::format_state_name;
3232
use datafusion_expr::{
3333
Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility,
3434
};
35+
use datafusion_macros::user_doc;
3536

3637
use crate::approx_percentile_cont::ApproxPercentileAccumulator;
3738

@@ -44,6 +45,20 @@ make_udaf_expr_and_func!(
4445
);
4546

4647
/// APPROX_MEDIAN aggregate expression
48+
#[user_doc(
49+
doc_section(label = "Approximate Functions"),
50+
description = "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
51+
syntax_example = "approx_median(expression)",
52+
sql_example = r#"```sql
53+
> SELECT approx_median(column_name) FROM table_name;
54+
+-----------------------------------+
55+
| approx_median(column_name) |
56+
+-----------------------------------+
57+
| 23.5 |
58+
+-----------------------------------+
59+
```"#,
60+
standard_argument(name = "expression",)
61+
)]
4762
pub struct ApproxMedian {
4863
signature: Signature,
4964
}
@@ -122,29 +137,6 @@ impl AggregateUDFImpl for ApproxMedian {
122137
}
123138

124139
fn documentation(&self) -> Option<&Documentation> {
125-
Some(get_approx_median_doc())
140+
self.doc()
126141
}
127142
}
128-
129-
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
130-
131-
fn get_approx_median_doc() -> &'static Documentation {
132-
DOCUMENTATION.get_or_init(|| {
133-
Documentation::builder(
134-
DOC_SECTION_APPROXIMATE,
135-
"Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.",
136-
137-
"approx_median(expression)")
138-
.with_sql_example(r#"```sql
139-
> SELECT approx_median(column_name) FROM table_name;
140-
+-----------------------------------+
141-
| approx_median(column_name) |
142-
+-----------------------------------+
143-
| 23.5 |
144-
+-----------------------------------+
145-
```"#,
146-
)
147-
.with_standard_argument("expression", None)
148-
.build()
149-
})
150-
}

datafusion/functions-aggregate/src/approx_percentile_cont.rs

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use datafusion_common::{
3535
downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err,
3636
DataFusionError, Result, ScalarValue,
3737
};
38-
use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE;
38+
use datafusion_doc::DocSection;
3939
use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
4040
use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
4141
use datafusion_expr::utils::format_state_name;
@@ -46,6 +46,7 @@ use datafusion_expr::{
4646
use datafusion_functions_aggregate_common::tdigest::{
4747
TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
4848
};
49+
use datafusion_macros::user_doc;
4950
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
5051

5152
create_func!(ApproxPercentileCont, approx_percentile_cont_udaf);
@@ -64,6 +65,28 @@ pub fn approx_percentile_cont(
6465
approx_percentile_cont_udaf().call(args)
6566
}
6667

68+
#[user_doc(
69+
doc_section(label = "Approximate Functions"),
70+
description = "Returns the approximate percentile of input values using the t-digest algorithm.",
71+
syntax_example = "approx_percentile_cont(expression, percentile, centroids)",
72+
sql_example = r#"```sql
73+
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
74+
+-------------------------------------------------+
75+
| approx_percentile_cont(column_name, 0.75, 100) |
76+
+-------------------------------------------------+
77+
| 65.0 |
78+
+-------------------------------------------------+
79+
```"#,
80+
standard_argument(name = "expression",),
81+
argument(
82+
name = "percentile",
83+
description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)."
84+
),
85+
argument(
86+
name = "centroids",
87+
description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory."
88+
)
89+
)]
6790
pub struct ApproxPercentileCont {
6891
signature: Signature,
6992
}
@@ -272,33 +295,10 @@ impl AggregateUDFImpl for ApproxPercentileCont {
272295
}
273296

274297
fn documentation(&self) -> Option<&Documentation> {
275-
Some(get_approx_percentile_cont_doc())
298+
self.doc()
276299
}
277300
}
278301

279-
static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
280-
281-
fn get_approx_percentile_cont_doc() -> &'static Documentation {
282-
DOCUMENTATION.get_or_init(|| {
283-
Documentation::builder(
284-
DOC_SECTION_APPROXIMATE,
285-
"Returns the approximate percentile of input values using the t-digest algorithm.",
286-
"approx_percentile_cont(expression, percentile, centroids)")
287-
.with_sql_example(r#"```sql
288-
> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name;
289-
+-------------------------------------------------+
290-
| approx_percentile_cont(column_name, 0.75, 100) |
291-
+-------------------------------------------------+
292-
| 65.0 |
293-
+-------------------------------------------------+
294-
```"#)
295-
.with_standard_argument("expression", None)
296-
.with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).")
297-
.with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory.")
298-
.build()
299-
})
300-
}
301-
302302
#[derive(Debug)]
303303
pub struct ApproxPercentileAccumulator {
304304
digest: TDigest,

0 commit comments

Comments
 (0)