Downloading Google Drive Files on Linux


1. Downloading with gdown

Downloading Google Drive files with wget or curl usually fails, so this post uses gdown instead; see the official GitHub repository.

1.1 Installation

pip install gdown

# to upgrade
pip install --upgrade gdown
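
A single file can then be downloaded from Python (or with the bundled gdown command). The snippet below is a minimal sketch: the share link and the output file name are placeholders, and fuzzy=True lets gdown extract the file id from an ordinary share link, the same call style used later in this post.

import gdown

# Minimal sketch: FILE_ID and the output name are placeholders.
# fuzzy=True extracts the file id from a regular "share" link.
url = "https://drive.google.com/file/d/FILE_ID/view?usp=sharing"
gdown.download(url=url, output="video.mp4", fuzzy=True)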

1.2 Downloading files in a folder
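gdown can also fetch an entire folder through download_folder (available in recent gdown releases; treat the exact signature as an assumption). A sketch, where the output directory name is hypothetical and the folder URL is the one that appears in the error below:

import gdown

# Download every file of a shared folder; "DiDeMo" is a hypothetical output dir.
folder_url = "https://drive.google.com/drive/folders/1huOL37wNOyMdCzbl8CIvJHDwCu5HLQ5o"
gdown.download_folder(url=folder_url, output="DiDeMo", quiet=False)

When the folder holds more than 50 files, this aborts with: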

Failed to retrieve folder contents:

        The gdrive folder with url: https://drive.google.com/drive/folders/1hu
        OL37wNOyMdCzbl8CIvJHDwCu5HLQ5o?hl=en has more than 50 files, gdrive
        can't download more than this limit.

You can use `--remaining-ok` option to ignore this error.

Appending --remaining-ok to the command suppresses this error, but gdown then downloads only the first 50 files in the folder rather than all of them, which is a real shortcoming of gdown (see the sketch below).
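For completeness, here is the same call with the option applied, a sketch assuming remaining_ok is the Python-API counterpart of the CLI flag in your gdown version:

import gdown

# remaining_ok=True silences the >50-files error, but gdown still only
# fetches the first 50 files of the folder.
gdown.download_folder(
    url="https://drive.google.com/drive/folders/1huOL37wNOyMdCzbl8CIvJHDwCu5HLQ5o",
    output="DiDeMo",   # hypothetical output directory
    remaining_ok=True,
)

Because of this cap, the rest of the post takes a different route: the files' share links are assumed to sit as hyperlinks in a Google Sheet, and each file is downloaded individually. The Google Apps Script below walks every cell's rich text and writes each hyperlink, together with its cell text, to a new 'Extracted Links' sheet: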

function extractLinks() {
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  var data = sheet.getDataRange().getValues();
  var output = [];
  
  for (var i = 0; i < data.length; i++) {
    for (var j = 0; j < data[i].length; j++) {
      var cell = sheet.getRange(i + 1, j + 1);
      var richTextValue = cell.getRichTextValue();
      if (richTextValue) {
        var runs = richTextValue.getRuns();
        for (var k = 0; k < runs.length; k++) {
          var url = runs[k].getLinkUrl();
          if (url) {
            output.push({
              row: i + 1,
              column: j + 1,
              text: runs[k].getText(),
              url: url
            });
          }
        }
      }
    }
  }
  
  // Create a new sheet for the output
  var outputSheet = SpreadsheetApp.getActiveSpreadsheet().insertSheet('Extracted Links');
  outputSheet.getRange(1, 1).setValue('Row');
  outputSheet.getRange(1, 2).setValue('Column');
  outputSheet.getRange(1, 3).setValue('Text');
  outputSheet.getRange(1, 4).setValue('URL');
  
  for (var m = 0; m < output.length; m++) {
    outputSheet.getRange(m + 2, 1).setValue(output[m].row);
    outputSheet.getRange(m + 2, 2).setValue(output[m].column);
    outputSheet.getRange(m + 2, 3).setValue(output[m].text);
    outputSheet.getRange(m + 2, 4).setValue(output[m].url);
  }
}

The script above writes the results to the output sheet one row at a time, which is slow (Apps Script caps a single execution at roughly 6 minutes). The version below collects all rows first and writes them in one call:

function extractLinks() {
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
  var data = sheet.getDataRange().getValues();
  var output = [];
  
  for (var i = 0; i < data.length; i++) {
    for (var j = 0; j < data[i].length; j++) {
      var cell = sheet.getRange(i + 1, j + 1);
      var richTextValue = cell.getRichTextValue();
      if (richTextValue) {
        var runs = richTextValue.getRuns();
        for (var k = 0; k < runs.length; k++) {
          var url = runs[k].getLinkUrl();
          if (url) {
            output.push([i + 1, j + 1, runs[k].getText(), url]);
          }
        }
      }
    }
  }
  
  // Create a new sheet for the output
  var outputSheet = SpreadsheetApp.getActiveSpreadsheet().insertSheet('Extracted Links');
  var headers = [['Row', 'Column', 'Text', 'URL']];
  var outputData = headers.concat(output);
  
  // Write all the data in a single call
  outputSheet.getRange(1, 1, outputData.length, outputData[0].length).setValues(outputData);
}
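
Export the 'Extracted Links' sheet as an Excel file; the scripts below assume it is saved as download.xlsx, with the Text column holding the file names and the URL column holding the share links. Each file can then be downloaded with gdown:
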
import gdown
import os
from tqdm import tqdm
import pandas as pd

# Read the Excel sheet into pandas
df = pd.read_excel('download.xlsx')
file_list = []  # skip files that have already been downloaded
for file in os.listdir("/data/wangyh/zhaolian/TimeChat/DiDeMo/train_videos"):
    file_list.append(file)

for index, row in tqdm(df.iterrows()):
    if row['Text'] in file_list:
        print(row['Text'], ': already exists')
    else:
        gdown.download(url=row['URL'], fuzzy=True)

Parallel download with multiple processes

import gdown
import os
from tqdm import tqdm
import pandas as pd
from multiprocessing import Pool

def split_list_into_parts(lst, num_parts):
    avg = len(lst) / float(num_parts)
    out = []
    last = 0.0

    while last < len(lst):
        out.append(lst[int(last):int(last + avg)])
        last += avg

    return out

all_ready_downloaded = []  # skip files that have already been downloaded
for file in os.listdir("/data/wangyh/zhaolian/TimeChat/DiDeMo/train_videos"):
    all_ready_downloaded.append(file)

# Read the Excel sheet into pandas
df = pd.read_excel('download.xlsx')
file_list = []
for index, row in tqdm(df.iterrows()):
    file_list.append(row)

def work1(part):
    for row in tqdm(part):
        if row['Text'] in all_ready_downloaded:
            print(row['Text'], ': already exists')
        else:
            gdown.download(url=row['URL'], fuzzy=True)

parts = split_list_into_parts(file_list, 5)  # split the rows into 5 chunks
pool = Pool(5)  # process pool with at most 5 workers
for i in range(5):
    pool.apply_async(func=work1, args=(parts[i],))  # run asynchronously (non-blocking)
print("------start------")
pool.close()  # stop accepting new tasks
pool.join()   # wait for all workers to finish; must come after pool.close()
print("-----end------")
gdown.exceptions.FileURLRetrievalError: Too many users have viewed or downloaded this file recently. Please try accessing the file again later. If the file you are trying to access is particularly large or is shared with many people, it may take up to 24 hours to be able to view or download the file. If you still can't access a file after 24 hours, contact your domain administrator

When this happens, switch to downloading through the Drive API from the command line. Open https://developers.google.com/oauthplayground, find Drive API v3, select https://www.googleapis.com/auth/drive.readonly, click Authorize APIs, then click Exchange authorization code for tokens, copy the generated Access Token, and substitute it for the ACCESS_TOKEN placeholder below.
In the command, FILE_ID and FILE_NAME are the id of the file to download and the name to save it under.

curl -H "Authorization: Bearer ACCESS_TOKEN" https://www.googleapis.com/drive/v3/files/FILE_ID?alt=media -o FILE_NAME

With that, only the gdown.download line in the script above needs to change, replaced by the call below (the access token has already been substituted, and file_id is assumed to have been extracted from row['URL']):

os.system(f"curl -H \"Authorization: Bearer ya29.a0AXooCguBpWXa6yJ4IMIEjeMJaqt_LCKc6wi0cu6bXWSIS6mGe92KJ4T8bsfsmbUpqcyTiPiZvi5pyNNTjdWl2R2L2nYLGpSdbT8aQiXXwiKFKr6QuB_oWgQMcKlvmtLXYtnja6yw0ghAnvciulSj5m9wc_RSNU49qD1WaCgYKAfISARISFQHGX2Mi4CV-wyzj4uGUlvbnROFgFw0171\" https://www.googleapis.com/drive/v3/files/{file_id}?alt=media -o {row['Text']}")