این زیرنویس‌ها فرمت ویژه‌ای دارند:
<li class="transcript-module"> Introduction to ASP.NET MVC 4 <ul> <li class="transcript-clip" data-p="author=scott-allen&name=mvc4-building-m1-intro&mode=live&clip=0&course=mvc4-building"><a href="javascript:void(0)" onclick="LaunchPlayerWindow('http://pluralsight.com/training', 'author=scott-allen&name=mvc4-building-m1-intro&mode=live&clip=0&course=mvc4-building');">Introduction</a><br /> <div> <a href="javascript:void(0)" onclick="p(this);" data-s="1.636">Hi, this is Scott Allen and this is the first module in the course design</a> </div> </li> <li class="transcript-clip" data-p="author=scott-allen&name=mvc4-building-m1-intro&mode=live&clip=1&course=mvc4-building"><a href="javascript:void(0)" onclick="LaunchPlayerWindow('http://pluralsight.com/training', 'author=scott-allen&name=mvc4-building-m1-intro&mode=live&clip=1&course=mvc4-building');">Web Platform Installer</a><br /> <div> ...
/// <summary>
/// A single clip of a course transcript: the clip title and its timed lines.
/// </summary>
public class TranscriptClip
{
    /// <summary>Title of the clip.</summary>
    public string Title { get; set; }

    /// <summary>The timed transcript lines that belong to this clip.</summary>
    public IList<TranscriptItem> TranscriptItems { get; set; }
}

/// <summary>
/// One transcript line together with the moment it starts.
/// </summary>
public class TranscriptItem
{
    /// <summary>Start time of the line, taken from the anchor's <c>data-s</c> value (seconds).</summary>
    public double StartTime { get; set; }

    /// <summary>Text of the line.</summary>
    public string Text { get; set; }
}
برای استخراج این اطلاعات، یکی از بهترین ابزارها، کتابخانه HTML Agility Pack است که توسط آن می‌توان به liهای یاد شده دسترسی یافت:
// Select the <div> that is a direct child of every <li> whose class attribute is exactly
// "transcript-clip"; in the sample markup that <div> holds the timed <a data-s="..."> lines.
// NOTE(review): `doc` is presumably an HtmlAgilityPack HtmlDocument that has already been loaded - confirm.
var nodes = doc.DocumentNode.SelectNodes("//li[@class='transcript-clip']/div");
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using HtmlAgilityPack;

namespace PluralsightTranscripts
{
    /// <summary>
    /// A single clip of a course transcript: the clip title and its timed lines.
    /// </summary>
    public class TranscriptClip
    {
        /// <summary>Title of the clip (text of the first anchor inside the clip's list item).</summary>
        public string Title { set; get; }

        /// <summary>The timed transcript lines that belong to this clip.</summary>
        public IList<TranscriptItem> TranscriptItems { set; get; }
    }

    /// <summary>
    /// One transcript line together with the moment it starts.
    /// </summary>
    public class TranscriptItem
    {
        /// <summary>Start time of the line in seconds (parsed from the anchor's <c>data-s</c> attribute).</summary>
        public double StartTime { set; get; }

        /// <summary>HTML-decoded text of the line.</summary>
        public string Text { set; get; }
    }

    /// <summary>
    /// Converts a saved transcript HTML page into one SRT subtitle file per clip.
    /// </summary>
    public class ExtractSubtitle
    {
        /// <summary>
        /// Display duration, in seconds, given to the last line of a clip,
        /// which has no following line to supply its end time.
        /// </summary>
        private const double LastItemDurationSeconds = 5;

        /// <summary>
        /// Reads the transcript HTML file and writes one <c>NN-Title.srt</c> file per clip
        /// (relative to the current directory), numbering the clips from 1.
        /// </summary>
        /// <param name="fileName">Path of the HTML file that contains the transcript markup.</param>
        public static void ConvertToSrt(string fileName)
        {
            var transcriptClips = extractItems(fileName);
            var itemNumber = 1;
            foreach (var item in transcriptClips)
            {
                transcriptClipToSrt(item, itemNumber);
                itemNumber++;
            }
        }

        /// <summary>Writes a single clip to its own SRT file.</summary>
        private static void transcriptClipToSrt(TranscriptClip item, int itemNumber)
        {
            var count = item.TranscriptItems.Count;
            var srtFileContent = transcriptItemsToSrt(item.TranscriptItems, count);
            var fileName = removeIllegalCharacters(
                string.Format(
                    CultureInfo.InvariantCulture,
                    "{0}-{1}.srt",
                    itemNumber.ToString("00", CultureInfo.InvariantCulture),
                    item.Title));
            File.WriteAllText(fileName, srtFileContent);
        }

        /// <summary>
        /// Renders the items as SRT entries: sequence number, time line, text and a blank separator line.
        /// </summary>
        private static string transcriptItemsToSrt(IList<TranscriptItem> items, int count)
        {
            var lineNumber = 1;
            var sb = new StringBuilder();
            for (int row = 0; row < count; row++)
            {
                sb.AppendLine(lineNumber.ToString(CultureInfo.InvariantCulture));
                sb.AppendLine(getTimeLine(items, count, row));
                sb.AppendLine(items[row].Text);
                sb.AppendLine(string.Empty);
                lineNumber++;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Builds the "start --> end" line of an entry. An entry ends where the next one starts;
        /// the last entry ends <see cref="LastItemDurationSeconds"/> seconds after its start.
        /// </summary>
        private static string getTimeLine(IList<TranscriptItem> items, int count, int row)
        {
            var startTs = TimeSpan.FromSeconds(items[row].StartTime);
            var endTs = row + 1 < count
                ? TimeSpan.FromSeconds(items[row + 1].StartTime)
                : TimeSpan.FromSeconds(items[row].StartTime + LastItemDurationSeconds);
            return string.Format(
                CultureInfo.InvariantCulture,
                "{0} --> {1}",
                timeSpanToString(startTs),
                timeSpanToString(endTs));
        }

        /// <summary>Formats a time span as the SRT timestamp <c>HH:MM:SS,mmm</c>.</summary>
        private static string timeSpanToString(TimeSpan lineTs)
        {
            // TimeSpan.Hours wraps at 24, so use the total hour count to keep long offsets correct.
            var hours = (int)lineTs.TotalHours;
            return string.Format(
                CultureInfo.InvariantCulture,
                "{0}:{1}:{2},{3}",
                hours.ToString("D2", CultureInfo.InvariantCulture),
                lineTs.Minutes.ToString("D2", CultureInfo.InvariantCulture),
                lineTs.Seconds.ToString("D2", CultureInfo.InvariantCulture),
                lineTs.Milliseconds.ToString("D3", CultureInfo.InvariantCulture));
        }

        /// <summary>
        /// Replaces every character that is invalid in a file name or path with a dot.
        /// </summary>
        private static string removeIllegalCharacters(string fileName)
        {
            // A direct character scan avoids building a regex character class out of raw characters,
            // which depends on none of them being special inside "[...]".
            var invalidChars = new HashSet<char>(Path.GetInvalidFileNameChars());
            invalidChars.UnionWith(Path.GetInvalidPathChars());

            var sb = new StringBuilder(fileName.Length);
            foreach (var c in fileName)
            {
                sb.Append(invalidChars.Contains(c) ? '.' : c);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Parses the transcript HTML file and returns its clips in document order.
        /// Returns an empty list when the file contains no transcript clips.
        /// </summary>
        private static IList<TranscriptClip> extractItems(string fileName)
        {
            var htmlContent = File.ReadAllText(fileName);
            var results = new List<TranscriptClip>();
            var doc = new HtmlDocument
            {
                OptionCheckSyntax = true,
                OptionFixNestedTags = true,
                OptionAutoCloseOnEnd = true,
                OptionDefaultStreamEncoding = Encoding.UTF8
            };
            doc.LoadHtml(htmlContent);

            var nodes = doc.DocumentNode.SelectNodes("//li[@class='transcript-clip']/div");
            if (nodes == null)
            {
                // SelectNodes yields null (not an empty collection) when nothing matches.
                return results;
            }

            foreach (var node in nodes)
            {
                var itemsList = new List<TranscriptItem>();

                // The clip title is the first anchor of the enclosing <li>; decode it like the item text
                // so HTML entities do not leak into the generated file name.
                var titleNode = node.ParentNode.ChildNodes.FirstOrDefault(x => x.Name == "a");
                var title = titleNode == null
                    ? string.Empty
                    : HttpUtility.HtmlDecode(titleNode.InnerText.Trim());

                foreach (var childNode in node.ChildNodes)
                {
                    if (childNode.Name != "a")
                    {
                        continue;
                    }

                    var dataS = childNode.Attributes.FirstOrDefault(x => x.Name == "data-s");
                    double startTime;

                    // data-s is machine-written with '.' as the decimal separator, so it must be parsed
                    // culture-invariantly; anchors without a usable start time are skipped.
                    if (dataS == null ||
                        !double.TryParse(dataS.Value, NumberStyles.Float, CultureInfo.InvariantCulture, out startTime))
                    {
                        continue;
                    }

                    itemsList.Add(new TranscriptItem
                    {
                        StartTime = startTime,
                        Text = HttpUtility.HtmlDecode(childNode.InnerText.Trim())
                    });
                }

                results.Add(new TranscriptClip { TranscriptItems = itemsList, Title = title });
            }

            return results;
        }
    }
}
فرمت SRT ساختار ساده‌ای دارد. هر گفتگوی آن حداقل از سه سطر تشکیل می‌شود. سطر اول یک شماره خود افزاینده است. سطر دوم زمان شروع و پایان گفتگو را مشخص می‌کند و سطر سوم بیانگر متن گفتگو است. برای مثال:
1
00:00:01,636 --> 00:00:05,616
Hi, this is Scott Allen and this is the first module in the course design
دریافت پروژه کامل این مطلب
PluralsightTranscripts.zip