You're viewing a comment by Werner and its responses.

Werner Permalink
February 21, 2008, 18:20

Hi,

YouTube changed it again. To make downloads work, I made a small change in the function get_vid_info:

#!/usr/bin/gawk -f
#
# 2007.07.10 v1.0 - initial release
# 2007.10.21 v1.1 - youtube changed the way it displays vids
#
# Peter Krumins (peter@catonmat.net)
# http://www.catonmat.net - good coders code, great reuse
#
# Usage: gawk --re-interval -f get_youtube_vids.awk  ...

BEGIN {
    if (ARGC == 1) usage();

    if ("fooooo" !~ "o{5}") {
        print "Error: --re-interval option was not specified!"
        print
        usage();
    }

    BINMODE = 3

    delete ARGV[0]
    print "Parsing YouTube video urls/IDs"
    for (i in ARGV) {
        vid_id = parse_url(ARGV[i])
        if (length(vid_id)  ..."
    exit 1
}

#
# function parse_url
#
# takes a url or an ID of a youtube video and returns just the ID
# for example the url could be the full url: http://www.youtube.com/watch?v=ID
# or it could be www.youtube.com/watch?v=ID
# or just youtube.com/watch?v=ID or http://youtube.com/watch?v=ID
# (https:// is accepted as well) or just the ID
#
# anything from the first "&" onwards (extra query parameters such as
# &feature=related) is dropped, so only the bare ID is returned
#
# p is a local scratch variable; callers pass only url
#
function parse_url(url,    p) {
    gsub(/https?:\/\//, "", url)              # get rid of http:// or https:// part
    gsub(/www\./,       "", url)              # get rid of www.    part
    gsub(/youtube\.com\/watch\?v=/, "", url)  # get rid of youtube.com... part

    # index() returns the position of the "&" itself, so keep only the
    # p - 1 characters in front of it (keeping p characters would leave
    # the "&" glued to the end of the ID)
    if ((p = index(url, "&")) > 0)
        url = substr(url, 1, p - 1)

    return url
}

#
# function get_vid_info
#
# function takes the youtube video ID and gets the title of the video
# and request string to .flv video file
#
function get_vid_info(vid_id, INFO) {
    YouTube = "/inet/tcp/0/www.youtube.com/80"
    Request = "GET /watch?v=" vid_id " HTTP/1.0\r\n\r\n"

    print Request |& YouTube
    while ((YouTube |& getline) > 0) {
        if (match($0, /"video_id":"([^"]+)".+"t":"([^"]+)"/, matches)) {
            # we found the request string
            #
            INFO["request"] = "video_id=" matches[1] "&t=" matches[2]
        }
        else if (match($0, /YouTube - ([^([^ filename

    # here we will do a little hackery to write the downloaded data
    # to file chunk by chunk instead of downloading it all to memory
    # and then writing
    #
    # the idea is to use a regex for the record field seperator
    # everything that gets matched is stored in RT variable
    # which gets written to disk after each match
    #
    RS = ".{1,512}" # let's read 512 byte records

    while ((Inet |& getline) > 0)
        print RT >> filename

    RS  = OLD_RS
    ORS = OLD_ORS
}

#
# function get_headers
#
# sends Request over the two-way connection Inet and stores the response
# headers in the HEADERS array, indexed by header name
# the HTTP response code (second field of the first response line) is
# stored under the special key "_status"
# reading stops at the first line that is not of the form "Name: value"
# (normally the empty line that ends the headers), so the next getline
# on Inet starts returning the contents
#
# if nothing at all can be read from Inet, an error is printed and the
# program exits with status 1
#
function get_headers(Inet, Request, HEADERS) {
    # remember the record separator so it can be put back before returning
    OLD_RS = RS

    print Request |& Inet

    # the first line of the response is the status line; without it
    # there is nothing to work with
    if ((Inet |& getline) <= 0) {
        print "Failed reading from the net. Quitting!"
        exit 1
    }
    HEADERS["_status"] = $2

    # header lines are terminated by CRLF
    RS = "\r\n"
    while ((Inet |& getline) > 0) {
        # splitting with FS=": " would break header values that contain
        # ": " themselves, so the name and the value are taken apart
        # with a single match instead
        if (!match($0, /([^:]+): (.+)/, matches))
            break

        HEADERS[matches[1]] = matches[2]
    }

    RS = OLD_RS
}

#
# function parse_location
#
# given a Location HTTP header value the function constructs a special
# inet file and the request storing them in FOO
#
# FOO["InetFile"] - gawk special file "/inet/tcp/0/<host>/<port>"
# FOO["Host"]     - host part exactly as it appears in the location
#                   (including ":port" when one is given)
# FOO["Request"]  - path and query string, starting with "/"
#
# the port defaults to 80 when the location does not name one
# all three entries are set to "" if the location cannot be parsed
#
# parts, host, port and hp are local scratch variables; callers pass only
# location and FOO
#
function parse_location(location, FOO,    parts, host, port, hp) {
    # location might look like http://cache.googlevideo.com/get_video?video_id=ID
    if (match(location, /http:\/\/([^\/]+)(\/.+)/, parts)) {
        host = parts[1]
        port = 80

        # an explicit port ("host:8080") has to go into the port slot of
        # the special file; leaving it glued to the host name would give
        # "/inet/tcp/0/host:8080/80", which cannot be opened
        if (match(host, /:([0-9]+)$/, hp)) {
            port = hp[1]
            host = substr(host, 1, RSTART - 1)
        }

        FOO["InetFile"] = "/inet/tcp/0/" host "/" port
        FOO["Host"]     = parts[1]
        FOO["Request"]  = parts[2]
    }
    else {
        FOO["InetFile"] = ""
        FOO["Host"]     = ""
        FOO["Request"]  = ""
    }
}

BR,
Werner.

Reply To This Comment

(why do I need your e-mail?)

(Your twitter handle, if you have one.)

Type the word "security_3": (just to make sure you're a human)

Please preview the comment before submitting to make sure it's OK.