Skip to content

Commit 3cdc8bf

Browse files
fix: updating HTML parsing rules to account for prosemirror-model@1.25.1 (#1661)
This significantly overhauls how we interpret HTML, based on the new parsing algorithm provided by ProseMirror. Some details about it can be seen in [the PR that addresses it](#1661). Co-authored-by: Matthew Lipski <[email protected]> Co-authored-by: Nick the Sick <[email protected]>
1 parent 56bd6a2 commit 3cdc8bf

File tree

61 files changed

+1718
-202
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+1718
-202
lines changed

packages/core/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
"hast-util-from-dom": "^5.0.1",
9595
"prosemirror-dropcursor": "^1.8.1",
9696
"prosemirror-highlight": "^0.13.0",
97-
"prosemirror-model": "^1.24.1",
97+
"prosemirror-model": "^1.25.1",
9898
"prosemirror-state": "^1.4.3",
9999
"prosemirror-tables": "^1.6.4",
100100
"prosemirror-transform": "^1.10.2",

packages/core/src/blocks/AudioBlockContent/AudioBlockContent.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ export const audioParse = (
7878
element: HTMLElement
7979
): Partial<Props<typeof audioBlockConfig.propSchema>> | undefined => {
8080
if (element.tagName === "AUDIO") {
81+
// Ignore if parent figure has already been parsed.
82+
if (element.closest("figure")) {
83+
return undefined;
84+
}
85+
8186
return parseAudioElement(element as HTMLAudioElement);
8287
}
8388

packages/core/src/blocks/CodeBlockContent/CodeBlockContent.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,12 @@ const CodeBlockContent = createStronglyTypedTiptapNode({
144144
},
145145
parseHTML() {
146146
return [
147+
// Parse from internal HTML.
147148
{
148149
tag: "div[data-content-type=" + this.name + "]",
149-
contentElement: "code",
150+
contentElement: ".bn-inline-content",
150151
},
152+
// Parse from external HTML.
151153
{
152154
tag: "pre",
153155
contentElement: "code",

packages/core/src/blocks/FileBlockContent/FileBlockContent.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ export const fileRender = (
4343

4444
export const fileParse = (element: HTMLElement) => {
4545
if (element.tagName === "EMBED") {
46+
// Ignore if parent figure has already been parsed.
47+
if (element.closest("figure")) {
48+
return undefined;
49+
}
50+
4651
return parseEmbedElement(element as HTMLEmbedElement);
4752
}
4853

packages/core/src/blocks/HeadingBlockContent/HeadingBlockContent.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,12 @@ const HeadingBlockContent = createStronglyTypedTiptapNode({
118118
},
119119
parseHTML() {
120120
return [
121+
// Parse from internal HTML.
121122
{
122123
tag: "div[data-content-type=" + this.name + "]",
124+
contentElement: ".bn-inline-content",
123125
},
126+
// Parse from external HTML.
124127
{
125128
tag: "h1",
126129
attrs: { level: 1 },

packages/core/src/blocks/ImageBlockContent/ImageBlockContent.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,11 @@ export const imageParse = (
8989
element: HTMLElement
9090
): Partial<Props<typeof imageBlockConfig.propSchema>> | undefined => {
9191
if (element.tagName === "IMG") {
92+
// Ignore if parent figure has already been parsed.
93+
if (element.closest("figure")) {
94+
return undefined;
95+
}
96+
9297
return parseImageElement(element as HTMLImageElement);
9398
}
9499

packages/core/src/blocks/ListItemBlockContent/BulletListItemBlockContent/BulletListItemBlockContent.ts

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
} from "../../../schema/index.js";
99
import { createDefaultBlockDOMOutputSpec } from "../../defaultBlockHelpers.js";
1010
import { defaultProps } from "../../defaultProps.js";
11+
import { getListItemContent } from "../getListItemContent.js";
1112
import { handleEnter } from "../ListItemKeyboardShortcuts.js";
1213

1314
export const bulletListItemPropSchema = {
@@ -73,10 +74,12 @@ const BulletListItemBlockContent = createStronglyTypedTiptapNode({
7374

7475
parseHTML() {
7576
return [
76-
// Case for regular HTML list structure.
77+
// Parse from internal HTML.
7778
{
7879
tag: "div[data-content-type=" + this.name + "]",
80+
contentElement: ".bn-inline-content",
7981
},
82+
// Parse from external HTML.
8083
{
8184
tag: "li",
8285
getAttrs: (element) => {
@@ -92,36 +95,17 @@ const BulletListItemBlockContent = createStronglyTypedTiptapNode({
9295

9396
if (
9497
parent.tagName === "UL" ||
95-
(parent.tagName === "DIV" && parent.parentElement!.tagName === "UL")
98+
(parent.tagName === "DIV" && parent.parentElement?.tagName === "UL")
9699
) {
97100
return {};
98101
}
99102

100103
return false;
101104
},
102-
node: "bulletListItem",
103-
},
104-
// Case for BlockNote list structure.
105-
{
106-
tag: "p",
107-
getAttrs: (element) => {
108-
if (typeof element === "string") {
109-
return false;
110-
}
111-
112-
const parent = element.parentElement;
113-
114-
if (parent === null) {
115-
return false;
116-
}
117-
118-
if (parent.getAttribute("data-content-type") === "bulletListItem") {
119-
return {};
120-
}
121-
122-
return false;
123-
},
124-
priority: 300,
105+
// As `li` elements can contain multiple paragraphs, we need to merge their contents
106+
// into a single one so that ProseMirror can parse everything correctly.
107+
getContent: (node, schema) =>
108+
getListItemContent(node, schema, this.name),
125109
node: "bulletListItem",
126110
},
127111
];

packages/core/src/blocks/ListItemBlockContent/CheckListItemBlockContent/CheckListItemBlockContent.ts

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
} from "../../../schema/index.js";
1313
import { createDefaultBlockDOMOutputSpec } from "../../defaultBlockHelpers.js";
1414
import { defaultProps } from "../../defaultProps.js";
15+
import { getListItemContent } from "../getListItemContent.js";
1516
import { handleEnter } from "../ListItemKeyboardShortcuts.js";
1617

1718
export const checkListItemPropSchema = {
@@ -109,17 +110,24 @@ const checkListItemBlockContent = createStronglyTypedTiptapNode({
109110

110111
parseHTML() {
111112
return [
113+
// Parse from internal HTML.
112114
{
113115
tag: "div[data-content-type=" + this.name + "]",
116+
contentElement: ".bn-inline-content",
114117
},
115-
// Checkbox only.
118+
// Parse from external HTML.
116119
{
117120
tag: "input",
118121
getAttrs: (element) => {
119122
if (typeof element === "string") {
120123
return false;
121124
}
122125

126+
// Ignore if we already parsed an ancestor list item to avoid double-parsing.
127+
if (element.closest("[data-content-type]") || element.closest("li")) {
128+
return false;
129+
}
130+
123131
if ((element as HTMLInputElement).type === "checkbox") {
124132
return { checked: (element as HTMLInputElement).checked };
125133
}
@@ -128,7 +136,6 @@ const checkListItemBlockContent = createStronglyTypedTiptapNode({
128136
},
129137
node: "checkListItem",
130138
},
131-
// Container element for checkbox + label.
132139
{
133140
tag: "li",
134141
getAttrs: (element) => {
@@ -144,7 +151,7 @@ const checkListItemBlockContent = createStronglyTypedTiptapNode({
144151

145152
if (
146153
parent.tagName === "UL" ||
147-
(parent.tagName === "DIV" && parent.parentElement!.tagName === "UL")
154+
(parent.tagName === "DIV" && parent.parentElement?.tagName === "UL")
148155
) {
149156
const checkbox =
150157
(element.querySelector(
@@ -160,6 +167,10 @@ const checkListItemBlockContent = createStronglyTypedTiptapNode({
160167

161168
return false;
162169
},
170+
// As `li` elements can contain multiple paragraphs, we need to merge their contents
171+
// into a single one so that ProseMirror can parse everything correctly.
172+
getContent: (node, schema) =>
173+
getListItemContent(node, schema, this.name),
163174
node: "checkListItem",
164175
},
165176
];

packages/core/src/blocks/ListItemBlockContent/NumberedListItemBlockContent/NumberedListItemBlockContent.ts

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
} from "../../../schema/index.js";
1010
import { createDefaultBlockDOMOutputSpec } from "../../defaultBlockHelpers.js";
1111
import { defaultProps } from "../../defaultProps.js";
12+
import { getListItemContent } from "../getListItemContent.js";
1213
import { handleEnter } from "../ListItemKeyboardShortcuts.js";
1314
import { NumberedListIndexingPlugin } from "./NumberedListIndexingPlugin.js";
1415

@@ -101,11 +102,12 @@ const NumberedListItemBlockContent = createStronglyTypedTiptapNode({
101102

102103
parseHTML() {
103104
return [
105+
// Parse from internal HTML.
104106
{
105107
tag: "div[data-content-type=" + this.name + "]",
108+
contentElement: ".bn-inline-content",
106109
},
107-
// Case for regular HTML list structure.
108-
// (e.g.: when pasting from other apps)
110+
// Parse from external HTML.
109111
{
110112
tag: "li",
111113
getAttrs: (element) => {
@@ -121,7 +123,7 @@ const NumberedListItemBlockContent = createStronglyTypedTiptapNode({
121123

122124
if (
123125
parent.tagName === "OL" ||
124-
(parent.tagName === "DIV" && parent.parentElement!.tagName === "OL")
126+
(parent.tagName === "DIV" && parent.parentElement?.tagName === "OL")
125127
) {
126128
const startIndex =
127129
parseInt(parent.getAttribute("start") || "1") || 1;
@@ -137,29 +139,10 @@ const NumberedListItemBlockContent = createStronglyTypedTiptapNode({
137139

138140
return false;
139141
},
140-
node: "numberedListItem",
141-
},
142-
// Case for BlockNote list structure.
143-
// (e.g.: when pasting from blocknote)
144-
{
145-
tag: "p",
146-
getAttrs: (element) => {
147-
if (typeof element === "string") {
148-
return false;
149-
}
150-
151-
const parent = element.parentElement;
152-
153-
if (parent === null) {
154-
return false;
155-
}
156-
157-
if (parent.getAttribute("data-content-type") === "numberedListItem") {
158-
return {};
159-
}
160-
161-
return false;
162-
},
142+
// As `li` elements can contain multiple paragraphs, we need to merge their contents
143+
// into a single one so that ProseMirror can parse everything correctly.
144+
getContent: (node, schema) =>
145+
getListItemContent(node, schema, this.name),
163146
priority: 300,
164147
node: "numberedListItem",
165148
},
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import { DOMParser, Fragment, Schema } from "prosemirror-model";
2+
3+
/**
4+
* This function is used to parse the content of an `li` element coming from external HTML.
5+
*
6+
* Due to a change in how prosemirror-model handles parsing elements, we have additional flexibility in how we can "fit" content into a list item.
7+
*
8+
* We've decided to take an approach that is similar to Notion. The core rules of the algorithm are:
9+
*
10+
* - If the first child of an `li` has ONLY text content, take the text content, and flatten it into the list item. Subsequent siblings are carried over as is, as children of the list item.
11+
* - e.g. `<li><h1>Hello</h1><p>World</p></li> -> <li>Hello<blockGroup><blockContainer><p>World</p></blockContainer></blockGroup></li>`
12+
* - Else, take the content and insert it as children instead.
13+
* - e.g. `<li><img src="url" /></li> -> <li><p></p><blockGroup><blockContainer><img src="url" /></blockContainer></blockGroup></li>`
14+
*
15+
* This ensures that a list item's content is always valid ProseMirror content, smoothing over differences between how external HTML may be rendered and how ProseMirror expects content to be structured.
16+
*/
17+
export function getListItemContent(
18+
/**
19+
* The `li` element to parse.
20+
*/
21+
_node: Node,
22+
/**
23+
* The schema to use for parsing.
24+
*/
25+
schema: Schema,
26+
/**
27+
* The name of the list item node.
28+
*/
29+
name: string
30+
): Fragment {
31+
/**
32+
* To actually implement this algorithm, we need to leverage ProseMirror's "fitting" algorithm.
33+
* With it, any parsed content which doesn't fit into the current node is moved into the parent node.
34+
*
35+
* This allows us to parse multiple pieces of content from within the list item (even though it normally would not match the list item's schema) and "throw" the excess content into the list item's children.
36+
*
37+
* The expected return value is a `Fragment` which contains the list item's content as the first element, and the children wrapped in a blockGroup node. Like so:
38+
* ```
39+
* Fragment<[Node<Text>, Node<BlockGroup<Node<BlockContainer<any>>>>]>
40+
* ```
41+
*/
42+
const parser = DOMParser.fromSchema(schema);
43+
44+
// TODO: This will be unnecessary in the future: https://github.com/ProseMirror/prosemirror-model/commit/166188d4f9db96eb86fb7de62e72049c86c9dd79
45+
const node = _node as HTMLElement;
46+
47+
// Move the `li` element's content into a new `div` element
48+
// This is a hacky workaround to not re-trigger list item parsing,
49+
// when we are looking to understand what the list item's content actually is, in terms of the schema.
50+
const clonedNodeDiv = document.createElement("div");
51+
// Mark the `div` element as a `blockGroup` to make the parsing easier.
52+
clonedNodeDiv.setAttribute("data-node-type", "blockGroup");
53+
// Clone all children of the `li` element into the new `div` element
54+
for (const child of Array.from(node.childNodes)) {
55+
clonedNodeDiv.appendChild(child.cloneNode(true));
56+
}
57+
58+
// Parses children of the `li` element into a `blockGroup` with `blockContainer` node children
59+
// This is the structure of list item children, so parsing into this structure allows for
60+
// easy separation of list item content from child list item content.
61+
let blockGroupNode = parser.parse(clonedNodeDiv, {
62+
topNode: schema.nodes.blockGroup.create(),
63+
});
64+
65+
// There is an edge case where a list item's content may contain an `<input>` element.
66+
// This causes it to be recognized as a `checkListItem`.
67+
// We want to skip this, and just parse the list item's content as is.
68+
if (blockGroupNode.firstChild?.firstChild?.type.name === "checkListItem") {
69+
// We skip the first child, by cutting it out of the `blockGroup` node.
70+
// and continuing with the rest of the algorithm.
71+
blockGroupNode = blockGroupNode.copy(
72+
blockGroupNode.content.cut(
73+
blockGroupNode.firstChild.firstChild.nodeSize + 2
74+
)
75+
);
76+
}
77+
78+
// Structure above is `blockGroup<blockContainer<any>[]>`
79+
// We want to extract the first `blockContainer` node's content, and see if it is a text block.
80+
const listItemsFirstChild = blockGroupNode.firstChild?.firstChild;
81+
82+
// If the first node is not a text block, then the `li`'s first child is not compatible with the list item node.
83+
if (!listItemsFirstChild?.isTextblock) {
84+
// So, we do not try inserting anything into the list item, and instead return anything we found as children for the list item.
85+
return Fragment.from(blockGroupNode);
86+
}
87+
88+
// If it is a text block, then we know it only contains text content.
89+
// So, we extract it, and insert its content into the `listItemNode`.
90+
// The remaining nodes in the `blockGroup` stay in-place.
91+
const listItemNode = schema.nodes[name].create(
92+
{},
93+
listItemsFirstChild.content
94+
);
95+
96+
// We have `blockGroup<listItemsFirstChild, ...blockContainer<any>[]>`
97+
// We want to extract out the rest of the nodes as `<...blockContainer<any>[]>`
98+
const remainingListItemChildren = blockGroupNode.content.cut(
99+
// +2 for the opening and closing tokens of the `blockContainer` that wraps the first child (offsets here are relative to the `blockGroup`'s content)
100+
listItemsFirstChild.nodeSize + 2
101+
);
102+
const hasRemainingListItemChildren = remainingListItemChildren.size > 0;
103+
104+
if (hasRemainingListItemChildren) {
105+
// Copy the remaining list item children back into the `blockGroup` node.
106+
// This will make it back into: `blockGroup<...blockContainer<any>[]>`
107+
const listItemsChildren = blockGroupNode.copy(remainingListItemChildren);
108+
109+
// Return the `listItem` node's content, then add the parsed children after to be lifted out by ProseMirror "fitting" algorithm.
110+
return listItemNode.content.addToEnd(listItemsChildren);
111+
}
112+
113+
// Otherwise, just return the `listItem` node's content.
114+
return listItemNode.content;
115+
}

packages/core/src/blocks/ParagraphBlockContent/ParagraphBlockContent.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,14 @@ export const ParagraphBlockContent = createStronglyTypedTiptapNode({
3939

4040
parseHTML() {
4141
return [
42-
{ tag: "div[data-content-type=" + this.name + "]" },
42+
// Parse from internal HTML.
43+
{
44+
tag: "div[data-content-type=" + this.name + "]",
45+
contentElement: ".bn-inline-content",
46+
},
47+
// Parse from external HTML.
4348
{
4449
tag: "p",
45-
priority: 200,
4650
getAttrs: (element) => {
4751
if (typeof element === "string" || !element.textContent?.trim()) {
4852
return false;

0 commit comments

Comments
 (0)