// This program downloads the public Twitter profile page of the user named on
// the command line, scrapes the tweets out of the HTML, and writes them to
// standard output as an Atom feed.
package main

import (
	"code.google.com/p/go-html-transform/h5"
	"code.google.com/p/go-html-transform/html/transform"
	"code.google.com/p/go.net/html"
	"code.google.com/p/go.tools/blog/atom"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"os"
	"strconv"
	"time"
)

// getattr looks up the attribute name in attrs and returns its value,
// together with whether the attribute was present at all.
func getattr(attrs []html.Attribute, name string) (val string, found bool) {
	for _, a := range attrs {
		if a.Key == name {
			val = a.Val
			found = true
			return
		}
	}
	return
}

// Textify flattens an HTML subtree into plain text. Element nodes that carry
// an "alt" attribute (images, emoji) contribute that text instead of being
// descended into.
func Textify(node *html.Node) string {
	switch node.Type {
	case html.TextNode:
		return node.Data
	case html.ElementNode:
		for _, att := range node.Attr {
			if att.Key == "alt" {
				return att.Val
			}
		}
		fallthrough
	case html.DocumentNode:
		text := ""
		for n := node.FirstChild; n != nil; n = n.NextSibling {
			text += Textify(n)
		}
		return text
	default:
		return ""
	}
}

// Tweet holds the parts of a single tweet scraped from the profile page.
type Tweet struct {
	Content string
	From    string
	ID      string
	Date    time.Time
}

// ScrapeTweets parses the HTML of a Twitter profile page and extracts the
// tweets it contains.
func ScrapeTweets(r io.Reader) ([]Tweet, error) {
	t, err := transform.NewFromReader(r)
	if err != nil {
		return nil, fmt.Errorf("Could not scrape profile: %s", err)
	}
	tweets := make([]Tweet, 0)
	t.Apply(func(node *html.Node) {
		var tweet Tweet
		tweet.From, _ = getattr(node.Attr, "data-screen-name")
		tweet.ID, _ = getattr(node.Attr, "data-item-id")
		time_ok := false
		tree := h5.NewTree(node)
		t2 := transform.New(&tree)
		// The tweet's date is carried as a Unix timestamp in the
		// data-time attribute of the timestamp link.
		t2.Apply(func(node *html.Node) {
			if ts, ok := getattr(node.Attr, "data-time"); ok {
				if ts_int, err := strconv.ParseInt(ts, 10, 64); err == nil {
					tweet.Date = time.Unix(ts_int, 0)
					time_ok = true
				}
			}
		}, "a.ProfileTweet-timestamp span")
		// Skip anything without a parseable timestamp.
		if !time_ok {
			return
		}
		t2.Apply(func(node *html.Node) {
			tweet.Content = Textify(node)
		}, ".ProfileTweet-text")
		tweets = append(tweets, tweet)
	}, "div.GridTimeline .ProfileTweet")
	return tweets, nil
}

// titlelimit is the maximum length of a generated entry title, in runes.
const titlelimit = 80

// Atomify converts a tweet into an Atom entry, truncating overly long titles
// and linking back to the tweet's permalink on twitter.com.
func (t Tweet) Atomify() *atom.Entry {
	entry := new(atom.Entry)
	entry.Title = "@" + t.From + ": " + t.Content
	if len([]rune(entry.Title)) > titlelimit {
		entry.Title = string([]rune(entry.Title)[:titlelimit-2]) + " …"
	}
	url := "https://twitter.com/" + t.From + "/status/" + t.ID
	entry.ID = url
	entry.Link = []atom.Link{{
		Rel:  "alternate",
		Href: url,
	}}
	entry.Summary = &atom.Text{Type: "text", Body: t.Content}
	entry.Content = &atom.Text{Type: "text", Body: t.Content}
	entry.Author = &atom.Person{
		Name: "@" + t.From,
		URI:  "https://twitter.com/" + t.From,
	}
	entry.Published = atom.Time(t.Date)
	entry.Updated = atom.Time(t.Date)
	return entry
}

func main() {
	os.Exit(Main())
}

// Main does the actual work and returns the process exit code; keeping it
// separate from main ensures deferred calls run before os.Exit.
func Main() int {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "Need one argument (twitter user name, without the '@')")
		return 1
	}
	user := os.Args[1]
	resp, err := http.Get("https://twitter.com/" + user)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Couldn't download @%s's stream: %s\n", user, err)
		return 1
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// resp.Status already includes the numeric code, e.g. "404 Not Found".
		fmt.Fprintf(os.Stderr, "Couldn't download @%s's stream: HTTP status %s\n", user, resp.Status)
		return 1
	}
	tweets, err := ScrapeTweets(resp.Body)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return 1
	}
	feed := atom.Feed{
		Title: "Tweets from @" + user,
		ID:    "https://twitter.com/" + user,
		Link: []atom.Link{
			{
				Rel:  "alternate",
				Href: "http://twitter.com/" + user,
			},
		},
		Author: &atom.Person{
			Name: "@" + user,
			URI:  "https://twitter.com/" + user,
		},
	}
	// Collect the entries and remember the newest tweet's date for the
	// feed's Updated field.
	var latest time.Time
	for _, tweet := range tweets {
		feed.Entry = append(feed.Entry, tweet.Atomify())
		if tweet.Date.After(latest) {
			latest = tweet.Date
		}
	}
	feed.Updated = atom.Time(latest)
	enc := xml.NewEncoder(os.Stdout)
	if err := enc.Encode(feed); err != nil {
		fmt.Fprintf(os.Stderr, "Could not encode feed: %s\n", err)
		return 1
	}
	return 0
}
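
// A minimal usage sketch (the file and output names below are illustrative,
// not part of the original program): run it with the Go tool, passing the
// Twitter user name without the '@', and redirect stdout to a file that a
// feed reader or web server can pick up, e.g.
//
//	go run twitter2atom.go golang > golang.atom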