33
44using System . Collections . Immutable ;
55using System . Composition ;
6-
6+ using System . Net ;
77using Docfx . Build . Common ;
88using Docfx . Common ;
99using Docfx . DataContracts . Common ;
1010using Docfx . Plugins ;
11+ using HtmlAgilityPack ;
1112
1213namespace Docfx . Build . ConceptualDocuments ;
1314
1415[ Export ( nameof ( ConceptualDocumentProcessor ) , typeof ( IDocumentBuildStep ) ) ]
1516class BuildConceptualDocument : BaseDocumentBuildStep
1617{
17- private const string ConceptualKey = Constants . PropertyName . Conceptual ;
1818 private const string DocumentTypeKey = "documentType" ;
1919
2020 public override string Name => nameof ( BuildConceptualDocument ) ;
@@ -28,16 +28,16 @@ public override void Build(FileModel model, IHostService host)
2828 return ;
2929 }
3030 var content = ( Dictionary < string , object > ) model . Content ;
31- var markdown = ( string ) content [ ConceptualKey ] ;
31+ var markdown = ( string ) content [ Constants . PropertyName . Conceptual ] ;
3232 var result = host . Markup ( markdown , model . OriginalFileAndType , false ) ;
3333
34- var htmlInfo = HtmlDocumentUtility . SeparateHtml ( result . Html ) ;
35- content [ "rawTitle" ] = htmlInfo . RawTitle ;
36- if ( ! string . IsNullOrEmpty ( htmlInfo . RawTitle ) )
34+ var ( h1 , h1Raw , conceptual ) = ExtractH1 ( result . Html ) ;
35+ content [ "rawTitle" ] = h1Raw ;
36+ if ( ! string . IsNullOrEmpty ( h1Raw ) )
3737 {
38- model . ManifestProperties . rawTitle = htmlInfo . RawTitle ;
38+ model . ManifestProperties . rawTitle = h1Raw ;
3939 }
40- content [ ConceptualKey ] = htmlInfo . Content ;
40+ content [ Constants . PropertyName . Conceptual ] = conceptual ;
4141
4242 if ( result . YamlHeader ? . Count > 0 )
4343 {
@@ -47,13 +47,15 @@ public override void Build(FileModel model, IHostService host)
4747 }
4848 }
4949
50- ( content [ Constants . PropertyName . Title ] , model . Properties . IsUserDefinedTitle ) = GetTitle ( result . YamlHeader , htmlInfo ) ;
50+ content [ Constants . PropertyName . Title ] = GetTitle ( result . YamlHeader , h1 ) ;
51+ content [ "wordCount" ] = WordCounter . CountWord ( conceptual ) ;
5152
5253 model . LinkToFiles = result . LinkToFiles . ToImmutableHashSet ( ) ;
5354 model . LinkToUids = result . LinkToUids ;
5455 model . FileLinkSources = result . FileLinkSources ;
5556 model . UidLinkSources = result . UidLinkSources ;
5657 model . Properties . XrefSpec = null ;
58+
5759 if ( model . Uids . Length > 0 )
5860 {
5961 var title = content [ Constants . PropertyName . Title ] as string ;
@@ -108,31 +110,31 @@ void HandleYamlHeaderPair(string key, object value)
108110 }
109111 }
110112
111- ( string title , bool isUserDefined ) GetTitle ( ImmutableDictionary < string , object > yamlHeader , SeparatedHtmlInfo info )
113+ string GetTitle ( ImmutableDictionary < string , object > yamlHeader , string h1 )
112114 {
113115 // title from YAML header
114116 if ( yamlHeader != null
115117 && TryGetStringValue ( yamlHeader , Constants . PropertyName . Title , out var yamlHeaderTitle ) )
116118 {
117- return ( yamlHeaderTitle , true ) ;
119+ return yamlHeaderTitle ;
118120 }
119121
120122 // title from metadata/titleOverwriteH1
121123 if ( TryGetStringValue ( content , Constants . PropertyName . TitleOverwriteH1 , out var titleOverwriteH1 ) )
122124 {
123- return ( titleOverwriteH1 , true ) ;
125+ return titleOverwriteH1 ;
124126 }
125127
126128 // title from H1
127- if ( ! string . IsNullOrEmpty ( info . Title ) )
129+ if ( ! string . IsNullOrEmpty ( h1 ) )
128130 {
129- return ( info . Title , false ) ;
131+ return h1 ;
130132 }
131133
132134 // title from globalMetadata or fileMetadata
133135 if ( TryGetStringValue ( content , Constants . PropertyName . Title , out var title ) )
134136 {
135- return ( title , true ) ;
137+ return title ;
136138 }
137139
138140 return default ;
@@ -152,4 +154,34 @@ bool TryGetStringValue(IDictionary<string, object> dictionary, string key, out s
152154 }
153155 }
154156 }
157+
/// <summary>
/// Splits rendered HTML into its first <c>h1</c> heading and the remaining markup.
/// </summary>
/// <param name="contentHtml">HTML to inspect; must not be null.</param>
/// <returns>
/// <c>h1</c>: the HTML-decoded inner text of the first <c>h1</c> found anywhere in the
/// document, or null when there is none.
/// <c>h1Raw</c>: the outer HTML of that heading, but only when it is the first top-level
/// node that is neither a comment nor whitespace; otherwise an empty string.
/// <c>body</c>: the document's outer HTML, with the heading removed in the case where
/// <c>h1Raw</c> is populated.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="contentHtml"/> is null.</exception>
static (string h1, string h1Raw, string body) ExtractH1(string contentHtml)
{
    ArgumentNullException.ThrowIfNull(contentHtml);

    var html = new HtmlDocument();
    html.LoadHtml(contentHtml);
    var root = html.DocumentNode;

    var heading = root.SelectSingleNode("//h1");

    // HtmlAgilityPack hands back InnerText without decoding entities (arguably a
    // library bug), so decode it here. Read before any removal below.
    var headingText = WebUtility.HtmlDecode(heading?.InnerText);

    // The heading is only lifted out of the body when it leads the document.
    if (heading == null || LeadingContentNode(root) != heading)
    {
        return (headingText, "", root.OuterHtml);
    }

    var headingHtml = heading.OuterHtml;
    heading.Remove();
    return (headingText, headingHtml, root.OuterHtml);

    // Returns the first child that is neither a comment nor whitespace-only markup,
    // or null when every child is skipped.
    static HtmlNode LeadingContentNode(HtmlNode parent)
    {
        for (var child = parent.FirstChild; child != null; child = child.NextSibling)
        {
            if (child.NodeType != HtmlNodeType.Comment && !string.IsNullOrWhiteSpace(child.OuterHtml))
            {
                return child;
            }
        }

        return null;
    }
}
155187}
0 commit comments